From dd60e39d0d37df65ed6aac131cdc93a1317d2782 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 14:04:23 -0700 Subject: [PATCH 01/19] Implemented OpenMP in the PyClaw step3ds.f90 for the dimensioanlly split algorithm. The unsplit algorithm can also be updated. Usage requires setting the environment variable OMP_NUM_THREADS before program execution. In addition the gfortran flag '-fopenmp' and the f2py flag '-lgomp' are required when compiling step3ds.f90. Added F90 and f2py options for OpenMP (-fopenmp, and gomp => -lgomp) in classic/setup.py so that when compiling 'classic3.so' the openmp lines are compiled. --- src/pyclaw/classic/setup.py | 2 +- src/pyclaw/classic/solver.py | 37 +- src/pyclaw/classic/step3ds.f90 | 743 ++++++++++++++++++--------------- 3 files changed, 421 insertions(+), 361 deletions(-) diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py index 03b55838e..a50de681b 100644 --- a/src/pyclaw/classic/setup.py +++ b/src/pyclaw/classic/setup.py @@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None): ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet']) config.add_extension('classic3', - ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet']) + ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp']) return config diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index ab7277aef..53c3a023e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -611,6 +611,12 @@ def __init__(self, riemann_solver=None, claw_package=None): self.aux3 = None self.work = None + import os + if os.environ['OMP_NUM_THREADS'] == None: + self.nthreads = 1 + else: + self.nthreads = int(os.environ['OMP_NUM_THREADS']) + super(ClawSolver3D,self).__init__(riemann_solver, claw_package) # ========== Setup routine ============================= @@ -636,10 +642,12 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - mwork = (maxm+2*num_ghost) * (31*num_eqn + num_waves + num_eqn*num_waves) + print "nthreads:",self.nthreads + self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + mwork = (maxm+2*num_ghost)\ + *(31*num_eqn + num_waves + num_eqn*num_waves)*self.nthreads self.work = np.empty((mwork),order='F') @@ -676,17 +684,20 @@ def step_hyperbolic(self,solution): #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. - q, cfl_x = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,1,self.fwave,rpn3,rpt3,rptt3) + q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_y = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,2,self.fwave,rpn3,rpt3,rptt3) + q, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_z = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,3,self.fwave,rpn3,rpt3,rptt3) + q, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ + self.fwave,rpn3,rpt3,rptt3) cfl = max(cfl_x,cfl_y,cfl_z) diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index c248a18ec..3b3ed70a6 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -1,61 +1,61 @@ +! ================================================================== +subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + ! Take one time step, updating q, to be used with + ! dimensional splitting. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step. + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! This is a simplified version of the subroutine step3d + ! Since only qadd and fadd is used in the flux updates, + ! the solution updates below are considerably simplified + ! compared to step3d. + + !----------------------------------------------------------------- + ! NOTE! Since dimensional splitting is used, it is possible + ! to reduce the memory requirement, i.e. the size of + ! the work array. It could be reduced with + ! + ! (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), + ! + ! when also possible reductions in flux3 are included. + ! However, this term is small compared to the dominating + ! term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). + !----------------------------------------------------------------- + + !$ use omp_lib -! ================================================================== - subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave -! # Take one time step, updating q, to be used with -! # dimensional splitting. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step. -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # This is a simplified version of the subroutine step3d -! # Since only qadd and fadd is used in the flux updates, -! # the solution updates below are considerably simplified -! # compared to step3d. - -! #----------------------------------------------------------------- -! # NOTE! Since dimensional splitting is used, it is possible -! # to reduce the memory requirement, i.e. the size of -! # the work array. It could be reduced with -! # -! # (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), -! # -! # when also possible reductions in flux3 are included. -! # However, this term is small compared to the dominating -! # term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). -! #----------------------------------------------------------------- - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom !f2py intent(out) cfl !f2py intent(in,out) qnew @@ -67,317 +67,366 @@ subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) -! # partition work array into pieces needed for local storage in -! # flux2 routine. Find starting index of each piece: + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 + nthreads=1 ! Serial + !$OMP PARALLEL + !$OMP SINGLE + !$ nthreads = omp_get_num_threads() + !$OMP END SINGLE + !$OMP END PARALLEL - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw2 - write(6,*) '*** not enough work space in step2' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + stop + endif + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + if (index_capa == 0) then + ! no capa array: + !$OMP PARALLEL PRIVATE(i,me,me1) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + !$OMP END PARALLEL + endif - if( idir == 1)then - - ! # perform x-sweeps - ! ================== + if( idir == 1)then - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) - aux2(ma,i,2+ka) = aux(ma,i,j,k+ka) - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform x-sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,i,ma,ka,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + ! Guided schedule seems to produce a small performance increase + ! relative to static or dynamic for problems with uniform work + ! per column. For problems with very nonuniform work per + ! column, either guided or dynamic is necessary to keep all the + ! threads busy. + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0,mz+1 + do j = 0,my+1 + + forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,i,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + if (num_aux > 0) then + forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) + aux2(ma,i,2+ka,me1) = aux(ma,i,j,k+ka) + end forall + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - / aux(index_capa,i,j,k) - end forall - endif - - 50 END DO - - else if( idir == 2 )then + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for the total fluxes, + ! the modifications are used immediately to update qnew + ! in order to save storage.) + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform y sweeps - ! ================== + else if( idir == 2 )then - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform y sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,j,ma,ia,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0, mz+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,j,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) + aux2(ma, j, 2+ia,me1) = aux(ma, i+ia, j, k) + end forall + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) - end forall - else - ! #with capa array. - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - / aux(index_capa,i,j,k) - end forall - endif - - 100 END DO - - else + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) + end forall + else + ! with capa array. + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform z sweeps - ! ================== + else - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications qadd and fadd along this slice - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform z sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,k,ma,ja,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do j = 0, my+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,k,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) + aux2(ma, k, 2+ja,me1) = aux(ma, i, j+ja, k) + end forall + endif + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications qadd and fadd along this slice + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy,dummy1, & + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1), & + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - - if(index_capa == 0)then - ! #no capa array. Standard flux differencing: - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - / aux(index_capa,i,j,k) - end forall - endif - - 150 END DO + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - endif + endif - return - end subroutine step3ds + return +end subroutine step3ds From 2cffe8b4cd00e45b1bb9a4a67dbe8cb41b76b0c2 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 14:16:32 -0700 Subject: [PATCH 02/19] Fixed bug when OMP_NUM_THREADS env variable is not set, and uses nthreads=1 --- src/pyclaw/classic/solver.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 53c3a023e..2eeb954c0 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,10 +612,7 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - if os.environ['OMP_NUM_THREADS'] == None: - self.nthreads = 1 - else: - self.nthreads = int(os.environ['OMP_NUM_THREADS']) + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) From 2bfff21e51030f5a8332565944aec669cec9fe32 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 14:23:17 -0700 Subject: [PATCH 03/19] Added a comment related to a hack of adding and subtracting a variable in a dimension size in order to trick f2py into not making nthreads an optional argument. Otherwise it used nthreads=shape(qadd,2) as a size. Since qadd is optional, it get a bad value when qadd is not in the call to step3ds(), which is the default for pyclaw. --- src/pyclaw/classic/step3ds.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index 3b3ed70a6..7d672f3b6 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -38,7 +38,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) ! Adding and subtracting mx is a hack to trick f2py, so it does not make nthreads optional. dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) From 3bc6ba8e48f74b45d7a0d17291cd5f219611dc7d Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Mon, 19 Oct 2015 08:33:15 -0700 Subject: [PATCH 04/19] Update solver.py removed debugging print statement --- src/pyclaw/classic/solver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 2eeb954c0..e5f07f378 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -639,7 +639,6 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - print "nthreads:",self.nthreads self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') From 7e21323cbf19553e893043216d5592ad8fa60f95 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Wed, 28 Oct 2015 23:26:11 -0700 Subject: [PATCH 05/19] Updated solver.py to set the default number of threads to the number of CPUs when the OMP_NUM_THREADS environment varibale is unset. This is consistent with what OpenMP returns for omp_get_num_threads() when the env variable is unset. Updated step3.f90 to use OpenMP, based on what is done in clawpack/classic. This was sucessfully tested with the Sedov.py test problem in the euler_3d example folder. --- src/pyclaw/classic/solver.py | 12 +- src/pyclaw/classic/step3.f90 | 1475 +++++++++++++++++++------------- src/pyclaw/classic/step3ds.f90 | 5 +- 3 files changed, 897 insertions(+), 595 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 2eeb954c0..888cb181a 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,7 +612,9 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) + import multiprocessing + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\ + multiprocessing.cpu_count())) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) @@ -639,7 +641,6 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - print "nthreads:",self.nthreads self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') @@ -700,9 +701,10 @@ def step_hyperbolic(self,solution): else: - q, cfl = self.fmod.step3(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,self.fwave,rpn3,rpt3,rptt3) + q, cfl = self.fmod.step3(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,\ + self.fwave,rpn3,rpt3,rptt3) self.cfl.update_global_max(cfl) state.set_q_from_qbc(self.num_ghost,self.qbc) diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90 index 211f626b8..ec8b91556 100644 --- a/src/pyclaw/classic/step3.f90 +++ b/src/pyclaw/classic/step3.f90 @@ -1,45 +1,50 @@ -! ================================================================== - subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== - -! # Take one time step, updating q. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # qadd is used to return increments to q from flux3. -! # fadd, gadd and hadd are used to return flux increments from flux3. -! # See the flux3 documentation for more information. - - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - +! ================================================================== +subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + + ! Take one time step, updating q. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! qadd is used to return increments to q from flux3. + ! fadd, gadd and hadd are used to return flux increments from flux3. + ! See the flux3 documentation for more information. + + !$ use omp_lib + + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave + + ! These block sizes must be at least 2 to avoid multiple threads + ! trying to update the same grid cell + integer, parameter :: blksiz_i = 2, blksiz_j = 2, blksiz_k = 2 + !f2py intent(out) cfl !f2py intent(in,out) qnew !f2py optional q1d, qadd, fadd, gadd, hadd, dtdx1d, dtdy1d, dtdz1d @@ -50,552 +55,846 @@ subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom - - - -! # partition work array into pieces needed for local storage in -! # flux3 routine. Find starting index of each piece: - - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 - - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw3 - write(6,*) '*** not enough work space in step3' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif - - - - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz - - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif - - -! # perform x-sweeps -! ================== - - - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - if (num_aux > 0) then - ! This odd construct may help improve cache locality. - ! (The F95 standard says each statement in the FORALL - ! must be executed for all indices before the next - ! statement is started, so this is different semantically - ! from putting all three indices in the same FORALL.) - forall (ka = -1:1) - forall (ma = 1:num_aux, i = 1-num_ghost:mx+num_ghost) - aux1(ma, i, 2+ka) = aux(ma, i, j-1, k+ka) - aux2(ma, i, 2+ka) = aux(ma, i, j, k+ka) - aux3(ma, i, 2+ka) = aux(ma, i, j+1, k+ka) - end forall - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along - ! # this slice: - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - - dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - - dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i)) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdy * gadd(m,1,0,i) & - - dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) ) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - dtdy * gadd(m,1,-1,i) & - - dtdz * hadd(m,1,-1,i) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - - dtdz * hadd(m,1,0,i) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) ) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - + dtdz * hadd(m,2,0,i) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - dtdy * gadd(m,1,1,i) & - + dtdz * hadd(m,2,-1,i) - end forall - - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - (dtdx * (fadd(m,i+1) - fadd(m,i)) & - + dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - + dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))) & - / aux(index_capa,i,j,k) - - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdy * gadd(m,1,0,i) & - + dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) )) & - / aux(index_capa,i,j-1,k) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - (dtdy * gadd(m,1,-1,i) & - + dtdz * hadd(m,1,-1,i)) & - / aux(index_capa,i,j-1,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - + dtdz * hadd(m,1,0,i)) & - / aux(index_capa,i,j,k-1) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + (dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i)) & - / aux(index_capa,i,j+1,k-1) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + (dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) )) & - / aux(index_capa,i,j+1,k) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + (dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i)) & - / aux(index_capa,i,j+1,k+1) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - (dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - - dtdz * hadd(m,2,0,i)) & - / aux(index_capa,i,j,k+1) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - (dtdy * gadd(m,1,1,i) & - - dtdz * hadd(m,2,-1,i)) & - / aux(index_capa,i,j-1,k+1) - end forall - endif - - 50 END DO - - - -! # perform y sweeps -! ================== - - - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - if (num_aux > 0) then - ! aux1, aux2, aux3 probably fit in cache, so optimize - ! access to aux - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux1(ma, j, 2+ia) = aux(ma, i+ia, j, k-1) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - aux3(ma, j, 2+ia) = aux(ma, i+ia, j, k+1) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - ! # gadd - modifies the h-fluxes - ! # hadd - modifies the f-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - - dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - - dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j)) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) ) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - + dtdx * hadd(m,2,0,j) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - dtdz * gadd(m,1,1,j) & - + dtdx * hadd(m,2,-1,j) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdz * gadd(m,1,0,j) & - - dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) ) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - dtdz * gadd(m,1,-1,j) & - - dtdx * hadd(m,1,-1,j) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - - dtdx * hadd(m,1,0,j) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j) - end forall - else - - ! #with capa array. - - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - (dtdy * (fadd(m,j+1) - fadd(m,j)) & - + dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - + dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))) & - / aux(index_capa,i,j,k) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + (dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) )) & - / aux(index_capa,i,j,k+1) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + (dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j)) & - / aux(index_capa,i+1,j,k+1) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - (dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - - dtdx * hadd(m,2,0,j) ) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - (dtdz * gadd(m,1,1,j) & - - dtdx * hadd(m,2,-1,j)) & - / aux(index_capa,i+1,j,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdz * gadd(m,1,0,j) & - + dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) )) & - / aux(index_capa,i,j,k-1) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - (dtdz * gadd(m,1,-1,j) & - + dtdx * hadd(m,1,-1,j)) & - / aux(index_capa,i-1,j,k-1) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - + dtdx * hadd(m,1,0,j)) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + (dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j)) & - / aux(index_capa,i-1,j,k+1) - end forall - endif - - 100 END DO - - - -! # perform z sweeps -! ================== - - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - if (num_aux > 0) then - ! See the comment on the X sweeps. This is semantically - ! slightly different than putting all the indices in the - ! same forall, and hopefully better for optimizing access - ! to aux. - forall (ja = -1:1, k = 1-num_ghost:mz+num_ghost) - forall (ma = 1:num_aux) - aux1(ma, k, 2+ja) = aux(ma, i-1, j+ja, k) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - aux3(ma, k, 2+ja) = aux(ma, i+1, j+ja, k) - end forall - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - ! # gadd - modifies the f-fluxes - ! # hadd - modifies the g-fluxes - - if(index_capa == 0)then - - ! #no capa array. Standard flux differencing: - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - - dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - - dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k)) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - + dtdy * hadd(m,2,0,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) ) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - - dtdy * hadd(m,1,0,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - dtdx * gadd(m,1,-1,k) & - - dtdy * hadd(m,1,-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdx * gadd(m,1,0,k) & - - dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) ) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - dtdx * gadd(m,1,1,k) & - + dtdy * hadd(m,2,-1,k) - end forall - else - - ! # with capa array - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - (dtdz * (fadd(m,k+1) - fadd(m,k)) & - + dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - + dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))) & - / aux(index_capa,i,j,k) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - (dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - - dtdy * hadd(m,2,0,k)) & - / aux(index_capa,i,j+1,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + (dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k)) & - / aux(index_capa,i+1,j+1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + (dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) )) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + (dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k)) & - / aux(index_capa,i+1,j-1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - + dtdy * hadd(m,1,0,k)) & - / aux(index_capa,i,j-1,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - (dtdx * gadd(m,1,-1,k) & - + dtdy * hadd(m,1,-1,k)) & - / aux(index_capa,i-1,j-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdx * gadd(m,1,0,k) & - + dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) )) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - (dtdx * gadd(m,1,1,k) & - - dtdy * hadd(m,2,-1,k)) & - / aux(index_capa,i-1,j+1,k) - end forall - endif - - 150 END DO - - - return - end subroutine step3 + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: + + nthreads=1 ! Serial + !$omp parallel + !$omp single + !$ nthreads = omp_get_num_threads() + !$omp end single + !$omp end parallel + + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves + + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 + + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3()' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads + stop + endif + + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + ! Calculate the number of blocks to use in each grid direction + ! (integer division, rounding up) + nblk_i = (mx + blksiz_i + 1)/blksiz_i + nblk_j = (my + blksiz_j + 1)/blksiz_j + nblk_k = (mz + blksiz_k + 1)/blksiz_k + + !$omp parallel private(me,me1,m,i,j,k,ma,ia,ja,ka,cfl1d,ioffset,joffset,koffset, & + !$omp iblk,jblk,kblk) reduction(max:cfl) + + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + + if (index_capa == 0) then + ! no capa array: + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + endif + + ! perform x-sweeps + ! ================== + + ! This block-based approach keeps race conditions from happening + ! with OpenMP parallelization. Because the results of the flux3 + ! call affect the column flux3 was called on and all adjacent + ! columns, an easy way to keep threads from interfering with each + ! other is to make sure that every thread takes a column that is + ! at least three cells away from all the other threads in each + ! direction. The code here accomplishes this by breaking the + ! columns into blocks; for a block size of 2 in all directions, + ! the breakdown looks like: + ! + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! + ! All the blocks of columns labeled with the same symbol (e.g. x) + ! are done in parallel in one pass; within each block, the work is + ! all done by one thread, avoiding race conditions. Then the code + ! flushes memory and synchronizes, and starts the next set of + ! blocks (e.g. *). + ! + ! Breaking up into blocks in this fashion helps provide a large + ! number of small, independent work items, which helps with + ! parallel performance. + + do koffset=0,1 + do joffset=0,1 + + ! Guided or dynamic scheduling is necessary to keep all the + ! threads busy if the work per column is very nonuniform. With + ! many blocks to process, guided is probably the best choice. + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do jblk = joffset, nblk_j-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + + ! copy data along a slice into 1d array: + do i = 1-num_ghost, mx+num_ghost + do m = 1, num_eqn + q1d(m,i,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux1(ma, i, 2+ka, me1) = aux(ma, i, j-1, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux2(ma, i, 2+ka, me1) = aux(ma, i, j, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux3(ma, i, 2+ka, me1) = aux(ma, i, j+1, k+ka) + end do + end do + end do + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along + ! this slice: + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for + ! the total fluxes, the modifications are used immediately + ! to update qnew in order to save storage.) + + if(index_capa == 0)then + ! Order so that access to qnew is in memory-contiguous + ! order as much as possible + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - dtdy * gadd(m,1,-1,i,me1) & + - dtdz * hadd(m,1,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + - dtdz * hadd(m,1,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1) + end do + end do + + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdy * gadd(m,1,0,i,me1) & + - dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + - dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + - dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - dtdy * gadd(m,1,1,i,me1) & + + dtdz * hadd(m,2,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + + dtdz * hadd(m,2,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1) + end do + end do + else + ! with capa array + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - (dtdy * gadd(m,1,-1,i,me1) & + + dtdz * hadd(m,1,-1,i,me1)) & + / aux(index_capa,i,j-1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + + dtdz * hadd(m,1,0,i,me1)) & + / aux(index_capa,i,j,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + (dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1)) & + / aux(index_capa,i,j+1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdy * gadd(m,1,0,i,me1) & + + dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) )) & + / aux(index_capa,i,j-1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - (dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + + dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + + dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)))& + / aux(index_capa,i,j,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + (dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) )) & + / aux(index_capa,i,j+1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - (dtdy * gadd(m,1,1,i,me1) & + - dtdz * hadd(m,2,-1,i,me1)) & + / aux(index_capa,i,j-1,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - (dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + - dtdz * hadd(m,2,0,i,me1)) & + / aux(index_capa,i,j,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + (dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1)) & + / aux(index_capa,i,j+1,k+1) + end do + end do + endif + + end do + end do ! End loops within block + + end do + end do ! End loops over blocks + + ! Flush memory (may not be necessary), then make sure everybody + ! synchronizes before the next offset + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform y sweeps + ! ================== + + do koffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + ! copy data along a slice into 1d array: + do j = 1-num_ghost, my+num_ghost + do m = 1, num_eqn + q1d(m,j,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + ! aux1, aux2, aux3 probably fit in cache, so optimize + ! access to aux + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux1(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k-1) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux2(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux3(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k+1) + end do + end do + end do + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + ! gadd - modifies the h-fluxes + ! hadd - modifies the f-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - dtdz * gadd(m,1,-1,j,me1) & + - dtdx * hadd(m,1,-1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdz * gadd(m,1,0,j,me1) & + - dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - dtdz * gadd(m,1,1,j,me1) & + + dtdx * hadd(m,2,-1,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + - dtdx * hadd(m,1,0,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + - dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + - dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + + dtdx * hadd(m,2,0,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1) + end do + end do + else + ! with capa array. + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - (dtdz * gadd(m,1,-1,j,me1) & + + dtdx * hadd(m,1,-1,j,me1)) & + / aux(index_capa,i-1,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdz * gadd(m,1,0,j,me1) & + + dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) )) & + / aux(index_capa,i,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - (dtdz * gadd(m,1,1,j,me1) & + - dtdx * hadd(m,2,-1,j,me1)) & + / aux(index_capa,i+1,j,k-1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + + dtdx * hadd(m,1,0,j,me1)) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - (dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + + dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + + dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - (dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + - dtdx * hadd(m,2,0,j,me1) ) & + / aux(index_capa,i+1,j,k) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + (dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1)) & + / aux(index_capa,i-1,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + (dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) )) & + / aux(index_capa,i,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + (dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1)) & + / aux(index_capa,i+1,j,k+1) + end do + end do + endif + + end do + end do ! End loops within blocks + + end do + end do ! End loops over blocks + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform z sweeps + ! ================== + + do joffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do jblk = joffset, nblk_j-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + do k = 1-num_ghost, mz+num_ghost + do m = 1, num_eqn + q1d(m,k,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do k = 1-num_ghost, mz+num_ghost + do ja = -1, 1 + do ma = 1, num_aux + aux1(ma, k, 2+ja, me1) = aux(ma, i-1, j+ja, k) + end do + do ma = 1, num_aux + aux2(ma, k, 2+ja, me1) = aux(ma, i, j+ja, k) + end do + do ma = 1, num_aux + aux3(ma, k, 2+ja, me1) = aux(ma, i+1, j+ja, k) + end do + end do + end do + endif + + + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1), & + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1), & + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s), & + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + ! gadd - modifies the f-fluxes + ! hadd - modifies the g-fluxes + + if(index_capa == 0)then + + ! no capa array. Standard flux differencing: + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - dtdx * gadd(m,1,-1,k,me1) & + - dtdy * hadd(m,1,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + - dtdy * hadd(m,1,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdx * gadd(m,1,0,k,me1) & + - dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) ) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + - dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + - dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) ) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - dtdx * gadd(m,1,1,k,me1) & + + dtdy * hadd(m,2,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + + dtdy * hadd(m,2,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1) + end do + end do + else + + ! with capa array + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - (dtdx * gadd(m,1,-1,k,me1) & + + dtdy * hadd(m,1,-1,k,me1)) & + / aux(index_capa,i-1,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + + dtdy * hadd(m,1,0,k,me1)) & + / aux(index_capa,i,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + (dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1)) & + / aux(index_capa,i+1,j-1,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdx * gadd(m,1,0,k,me1) & + + dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) )) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - (dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + + dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + + dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + (dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) )) & + / aux(index_capa,i+1,j,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - (dtdx * gadd(m,1,1,k,me1) & + - dtdy * hadd(m,2,-1,k,me1)) & + / aux(index_capa,i-1,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - (dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + - dtdy * hadd(m,2,0,k,me1)) & + / aux(index_capa,i,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + (dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1)) & + / aux(index_capa,i+1,j+1,k) + end do + end do + endif + + end do + end do + + end do + end do + + !$omp flush + !$omp barrier + end do + end do + !$omp end parallel + + return +end subroutine step3 diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index 7d672f3b6..79d9f0d89 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -38,7 +38,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) ! Adding and subtracting mx is a hack to trick f2py, so it does not make nthreads optional. + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) @@ -118,8 +118,9 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & if (iused > mwork) then ! This shouldn't happen due to checks in claw3 - write(6,*) '*** not enough work space in step3' + write(6,*) '*** not enough work space in step3ds()' write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads stop endif From 9a871d6b7e54dbd366735053923600418aa43889 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Thu, 5 Nov 2015 09:29:33 -0800 Subject: [PATCH 06/19] Updated implementation to use 1 OpenMP thread by default and give a warning that the 'OMP_NUM_THREADS' environment varibale is unset. This allows an unaware user to run their codes without any changes, and they will see the warning and can adjust accordingly. --- src/pyclaw/classic/solver.py | 9 ++++++--- src/pyclaw/classic/step3.f90 | 5 +++++ src/pyclaw/classic/step3ds.f90 | 5 +++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 888cb181a..af9b94523 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,9 +612,12 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - import multiprocessing - self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\ - multiprocessing.cpu_count())) + import warnings + if 'OMP_NUM_THREADS' in os.environ: + pass # Using OpenMP + else: + warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90 index ec8b91556..89233b85c 100644 --- a/src/pyclaw/classic/step3.f90 +++ b/src/pyclaw/classic/step3.f90 @@ -40,6 +40,7 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & dimension method(7),mthlim(num_waves) dimension work(mwork) logical :: use_fwave + character :: env_value ! These block sizes must be at least 2 to avoid multiple threads ! trying to update the same grid cell @@ -61,6 +62,10 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & ! flux3 routine. Find starting index of each piece: nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF !$omp parallel !$omp single !$ nthreads = omp_get_num_threads() diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index 79d9f0d89..4186d86e5 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -54,6 +54,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & dimension method(7),mthlim(num_waves) dimension work(mwork) logical :: use_fwave + character :: env_value common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom @@ -71,6 +72,10 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & ! flux3 routine. Find starting index of each piece: nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF !$OMP PARALLEL !$OMP SINGLE !$ nthreads = omp_get_num_threads() From 44230e85a024a5a754eab25ac0c7f4102002f140 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Thu, 5 Nov 2015 09:43:21 -0800 Subject: [PATCH 07/19] small simplification to OMP_NUM_THREADS env variable check. --- src/pyclaw/classic/solver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index af9b94523..642320912 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -613,9 +613,7 @@ def __init__(self, riemann_solver=None, claw_package=None): import os import warnings - if 'OMP_NUM_THREADS' in os.environ: - pass # Using OpenMP - else: + if 'OMP_NUM_THREADS' not in os.environ: warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) From 2d325b45f5e62c3036e24571b857f2bc6d97f2ba Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 6 Nov 2015 09:31:56 -0800 Subject: [PATCH 08/19] Updated OpenMP warning to use the PyClaw logging. This prints to the console and to the log file. --- src/pyclaw/classic/solver.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 642320912..2b91c234e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -610,15 +610,17 @@ def __init__(self, riemann_solver=None, claw_package=None): self.aux2 = None self.aux3 = None self.work = None + self.nthreads = None + + super(ClawSolver3D,self).__init__(riemann_solver, claw_package) import os - import warnings if 'OMP_NUM_THREADS' not in os.environ: - warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") + self.logger.warn(""""The env variable 'OMP_NUM_THREADS' is unset. + Set this variable to use OpenMP with more + than 1 thread (i.e. export OMP_NUM_THREADS=4).""") self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) - super(ClawSolver3D,self).__init__(riemann_solver, claw_package) - # ========== Setup routine ============================= def _allocate_workspace(self,solution): r""" From 66a28ee57987e783e7a4d48a55f10d4cfb5f85f2 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 11:30:22 -0700 Subject: [PATCH 09/19] Implemented OpenMP in the PyClaw step3ds.f90 for the dimensioanlly split algorithm. The unsplit algorithm can also be updated. Usage requires setting the environment variable OMP_NUM_THREADS before program execution. In addition the gfortran flag '-fopenmp' and the f2py flag '-lgomp' are required when compiling step3ds.f90. This is done for a custom version of classic3.so, and might also be necessary for the general version. --- src/pyclaw/classic/solver.py | 37 +- src/pyclaw/classic/step3ds.f90 | 743 ++++++++++++++++++--------------- 2 files changed, 420 insertions(+), 360 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index ab7277aef..53c3a023e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -611,6 +611,12 @@ def __init__(self, riemann_solver=None, claw_package=None): self.aux3 = None self.work = None + import os + if os.environ['OMP_NUM_THREADS'] == None: + self.nthreads = 1 + else: + self.nthreads = int(os.environ['OMP_NUM_THREADS']) + super(ClawSolver3D,self).__init__(riemann_solver, claw_package) # ========== Setup routine ============================= @@ -636,10 +642,12 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3),order='F') - mwork = (maxm+2*num_ghost) * (31*num_eqn + num_waves + num_eqn*num_waves) + print "nthreads:",self.nthreads + self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') + mwork = (maxm+2*num_ghost)\ + *(31*num_eqn + num_waves + num_eqn*num_waves)*self.nthreads self.work = np.empty((mwork),order='F') @@ -676,17 +684,20 @@ def step_hyperbolic(self,solution): #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. - q, cfl_x = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,1,self.fwave,rpn3,rpt3,rptt3) + q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_y = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,2,self.fwave,rpn3,rpt3,rptt3) + q, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ + self.fwave,rpn3,rpt3,rptt3) - q, cfl_z = self.fmod.step3ds(maxm,self.num_ghost,mx,my,mz, \ - q,q,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,3,self.fwave,rpn3,rpt3,rptt3) + q, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ + self.fwave,rpn3,rpt3,rptt3) cfl = max(cfl_x,cfl_y,cfl_z) diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index c248a18ec..3b3ed70a6 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -1,61 +1,61 @@ +! ================================================================== +subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + ! Take one time step, updating q, to be used with + ! dimensional splitting. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step. + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! This is a simplified version of the subroutine step3d + ! Since only qadd and fadd is used in the flux updates, + ! the solution updates below are considerably simplified + ! compared to step3d. + + !----------------------------------------------------------------- + ! NOTE! Since dimensional splitting is used, it is possible + ! to reduce the memory requirement, i.e. the size of + ! the work array. It could be reduced with + ! + ! (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), + ! + ! when also possible reductions in flux3 are included. + ! However, this term is small compared to the dominating + ! term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). + !----------------------------------------------------------------- + + !$ use omp_lib -! ================================================================== - subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,idir,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave -! # Take one time step, updating q, to be used with -! # dimensional splitting. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step. -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # This is a simplified version of the subroutine step3d -! # Since only qadd and fadd is used in the flux updates, -! # the solution updates below are considerably simplified -! # compared to step3d. - -! #----------------------------------------------------------------- -! # NOTE! Since dimensional splitting is used, it is possible -! # to reduce the memory requirement, i.e. the size of -! # the work array. It could be reduced with -! # -! # (maxm + 2*num_ghost)*(37*num_eqn + 6*num_aux), -! # -! # when also possible reductions in flux3 are included. -! # However, this term is small compared to the dominating -! # term (mx+2num_ghost)(my+2mb)*(mz+2num_ghost). -! #----------------------------------------------------------------- - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom !f2py intent(out) cfl !f2py intent(in,out) qnew @@ -67,317 +67,366 @@ subroutine step3ds(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) -! # partition work array into pieces needed for local storage in -! # flux2 routine. Find starting index of each piece: + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 + nthreads=1 ! Serial + !$OMP PARALLEL + !$OMP SINGLE + !$ nthreads = omp_get_num_threads() + !$OMP END SINGLE + !$OMP END PARALLEL - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw2 - write(6,*) '*** not enough work space in step2' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + stop + endif + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + if (index_capa == 0) then + ! no capa array: + !$OMP PARALLEL PRIVATE(i,me,me1) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + !$OMP END PARALLEL + endif - if( idir == 1)then - - ! # perform x-sweeps - ! ================== + if( idir == 1)then - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) - aux2(ma,i,2+ka) = aux(ma,i,j,k+ka) - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform x-sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,i,ma,ka,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + ! Guided schedule seems to produce a small performance increase + ! relative to static or dynamic for problems with uniform work + ! per column. For problems with very nonuniform work per + ! column, either guided or dynamic is necessary to keep all the + ! threads busy. + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0,mz+1 + do j = 0,my+1 + + forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,i,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + if (num_aux > 0) then + forall (ma=1:num_aux,i= 1-num_ghost:mx+num_ghost, ka = -1:1) + aux2(ma,i,2+ka,me1) = aux(ma,i,j,k+ka) + end forall + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - / aux(index_capa,i,j,k) - end forall - endif - - 50 END DO - - else if( idir == 2 )then + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for the total fluxes, + ! the modifications are used immediately to update qnew + ! in order to save storage.) + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, i = 1:mx) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform y sweeps - ! ================== + else if( idir == 2 )then - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications qadd and fadd along this slice - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform y sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,j,ma,ia,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do k = 0, mz+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,j,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) + aux2(ma, j, 2+ia,me1) = aux(ma, i+ia, j, k) + end forall + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications qadd and fadd along this slice + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx,dummy1,& + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1),& + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) - end forall - else - ! #with capa array. - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - / aux(index_capa,i,j,k) - end forall - endif - - 100 END DO - - else + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) + end forall + else + ! with capa array. + forall (m = 1:num_eqn, j = 1:my) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - ! # perform z sweeps - ! ================== + else - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - ! # Since dimensional splitting is used, only aux2 is needed. - - if (num_aux > 0) then - ! There's a decent chance aux2 will all fit into cache, - ! so keep accesses to aux as contiguous as possible. - forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications qadd and fadd along this slice - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,dummy1,aux2,dummy3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & + ! perform z sweeps + ! ================== + !$OMP PARALLEL PRIVATE(me,me1,m,k,ma,ja,cfl1d) REDUCTION(MAX:cfl) + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + cfl1d = 0.d0 + + !$OMP DO COLLAPSE(2) SCHEDULE(GUIDED) + do j = 0, my+1 + do i = 0, mx+1 + + forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) + ! copy data along a slice into 1d array: + q1d(m,k,me1) = qold(m,i,j,k) + end forall + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + ! Since dimensional splitting is used, only aux2 is needed. + + if (num_aux > 0) then + ! There's a decent chance aux2 will all fit into cache, + ! so keep accesses to aux as contiguous as possible. + forall (ma=1:num_aux,ja= -1:1, k = 1-num_ghost:mz+num_ghost) + aux2(ma, k, 2+ja,me1) = aux(ma, i, j+ja, k) + end forall + endif + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications qadd and fadd along this slice + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy,dummy1, & + aux2(1,1-num_ghost,1,me1),dummy3,num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),fadd(1,1-num_ghost,me1), & + gadd(1,1,-1,1-num_ghost,me1),hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & use_fwave,rpn3,rpt3,rptt3) - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - - if(index_capa == 0)then - ! #no capa array. Standard flux differencing: - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) - end forall - else - ! # with capa array - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - / aux(index_capa,i,j,k) - end forall - endif - - 150 END DO + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + + if(index_capa == 0)then + ! no capa array. Standard flux differencing: + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) + end forall + else + ! with capa array + forall (m = 1:num_eqn, k = 1:mz) + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + / aux(index_capa,i,j,k) + end forall + endif + + end do + end do + !$OMP END PARALLEL - endif + endif - return - end subroutine step3ds + return +end subroutine step3ds From 87f5f16fdd88e3e5a2033788321647c685dc5579 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 13:48:42 -0700 Subject: [PATCH 10/19] Added F90 and f2py options for OpenMP (-fopenmp, and gomp => -lgomp) so that when compiling 'classic3.so' the openmp lines are compiled. --- src/pyclaw/classic/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py index 03b55838e..a50de681b 100644 --- a/src/pyclaw/classic/setup.py +++ b/src/pyclaw/classic/setup.py @@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None): ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet']) config.add_extension('classic3', - ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet']) + ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp']) return config From 00880f66d7fcd391c88ca0c4a7154e1b2d6a05d6 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 16 Oct 2015 14:50:03 -0700 Subject: [PATCH 11/19] Fixed bug when OMP_NUM_THREADS env variable is not set, and uses nthreads=1 --- src/pyclaw/classic/solver.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 53c3a023e..2eeb954c0 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,10 +612,7 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - if os.environ['OMP_NUM_THREADS'] == None: - self.nthreads = 1 - else: - self.nthreads = int(os.environ['OMP_NUM_THREADS']) + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) From 3dad707298d8cdc932d4b6da1df12941a0d9c1d4 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Mon, 19 Oct 2015 08:49:04 -0700 Subject: [PATCH 12/19] removed debugging print statement --- src/pyclaw/classic/solver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 2eeb954c0..e5f07f378 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -639,7 +639,6 @@ def _allocate_workspace(self,solution): # These work arrays really ought to live inside a fortran module # as is done for sharpclaw - print "nthreads:",self.nthreads self.aux1 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux2 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') self.aux3 = np.empty((num_aux,maxm+2*num_ghost,3,self.nthreads),order='F') From 51f716194bca0593b3eee52785b2b673ed3ba535 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Wed, 28 Oct 2015 23:24:13 -0700 Subject: [PATCH 13/19] Updated solver.py to set the default number of threads to the number of CPUs when the OMP_NUM_THREADS environment varibale is unset. This is consistent with what OpenMP returns for omp_get_num_threads() when the env variable is unset. Updated step3.f90 to use OpenMP, based on what is done in clawpack/classic. This was sucessfully tested with the Sedov.py test problem in the euler_3d example folder. --- src/pyclaw/classic/solver.py | 11 +- src/pyclaw/classic/step3.f90 | 1475 +++++++++++++++++++------------- src/pyclaw/classic/step3ds.f90 | 3 +- 3 files changed, 896 insertions(+), 593 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index e5f07f378..888cb181a 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,7 +612,9 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) + import multiprocessing + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\ + multiprocessing.cpu_count())) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) @@ -699,9 +701,10 @@ def step_hyperbolic(self,solution): else: - q, cfl = self.fmod.step3(maxm,self.num_ghost,mx,my,mz, \ - qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,self._mthlim,\ - self.aux1,self.aux2,self.aux3,self.work,self.fwave,rpn3,rpt3,rptt3) + q, cfl = self.fmod.step3(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,\ + self.fwave,rpn3,rpt3,rptt3) self.cfl.update_global_max(cfl) state.set_q_from_qbc(self.num_ghost,self.qbc) diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90 index 211f626b8..ec8b91556 100644 --- a/src/pyclaw/classic/step3.f90 +++ b/src/pyclaw/classic/step3.f90 @@ -1,45 +1,50 @@ -! ================================================================== - subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & - mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & - qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d, & - aux1,aux2,aux3,num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) -! ================================================================== - -! # Take one time step, updating q. -! # On entry, qold and qnew should be identical and give the -! # initial data for this step -! # On exit, qnew returns values at the end of the time step. -! # qold is unchanged. - -! # qadd is used to return increments to q from flux3. -! # fadd, gadd and hadd are used to return flux increments from flux3. -! # See the flux3 documentation for more information. - - - implicit real*8(a-h,o-z) - external rpn3,rpt3,rptt3 - dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost) - dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost) - dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost) - dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & - 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) - dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3) - dimension dtdx1d(1-num_ghost:maxm+num_ghost) - dimension dtdy1d(1-num_ghost:maxm+num_ghost) - dimension dtdz1d(1-num_ghost:maxm+num_ghost) - dimension method(7),mthlim(num_waves) - dimension work(mwork) - logical :: use_fwave - +! ================================================================== +subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & + mz,qold,qnew,aux,dx,dy,dz,dt,method,mthlim,cfl, & + qadd,fadd,gadd,hadd,q1d,dtdx1d,dtdy1d,dtdz1d,aux1,aux2,aux3,& + num_aux,work,mwork,use_fwave,rpn3,rpt3,rptt3) +! ================================================================== + + ! Take one time step, updating q. + ! On entry, qold and qnew should be identical and give the + ! initial data for this step + ! On exit, qnew returns values at the end of the time step. + ! qold is unchanged. + ! + ! qadd is used to return increments to q from flux3. + ! fadd, gadd and hadd are used to return flux increments from flux3. + ! See the flux3 documentation for more information. + + !$ use omp_lib + + implicit real*8(a-h,o-z) + external rpn3,rpt3,rptt3 + dimension qold(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension qnew(num_eqn, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension q1d(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension qadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension fadd(num_eqn,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension gadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension hadd(num_eqn,2,-1:1,1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension aux(num_aux, 1-num_ghost:mx+num_ghost, & + 1-num_ghost:my+num_ghost,1-num_ghost:mz+num_ghost) + dimension aux1(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux2(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension aux3(num_aux,1-num_ghost:maxm+num_ghost,3,mx+nthreads-mx) + dimension dtdx1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdy1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension dtdz1d(1-num_ghost:maxm+num_ghost,mx+nthreads-mx) + dimension method(7),mthlim(num_waves) + dimension work(mwork) + logical :: use_fwave + + ! These block sizes must be at least 2 to avoid multiple threads + ! trying to update the same grid cell + integer, parameter :: blksiz_i = 2, blksiz_j = 2, blksiz_k = 2 + !f2py intent(out) cfl !f2py intent(in,out) qnew !f2py optional q1d, qadd, fadd, gadd, hadd, dtdx1d, dtdy1d, dtdz1d @@ -50,552 +55,846 @@ subroutine step3(maxm,num_eqn,num_waves,num_ghost,mx,my, & !f2py x=rpt3(x) !f2py x=rptt3(x) - common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom - - - -! # partition work array into pieces needed for local storage in -! # flux3 routine. Find starting index of each piece: - - i0wave = 1 - i0s = i0wave + (maxm+2*num_ghost)*num_eqn*num_waves - i0amdq = i0s + (maxm+2*num_ghost)*num_waves - i0apdq = i0amdq + (maxm+2*num_ghost)*num_eqn - i0cqxx = i0apdq + (maxm+2*num_ghost)*num_eqn - i0bmamdq = i0cqxx + (maxm+2*num_ghost)*num_eqn - i0bmapdq = i0bmamdq + (maxm+2*num_ghost)*num_eqn - i0bpamdq = i0bmapdq + (maxm+2*num_ghost)*num_eqn - i0bpapdq = i0bpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq = i0bpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq = i0cmamdq + (maxm+2*num_ghost)*num_eqn - i0cpamdq = i0cmapdq + (maxm+2*num_ghost)*num_eqn - i0cpapdq = i0cpamdq + (maxm+2*num_ghost)*num_eqn - i0cmamdq2 = i0cpapdq + (maxm+2*num_ghost)*num_eqn - i0cmapdq2 = i0cmamdq2 + (maxm+2*num_ghost)*num_eqn - i0cpamdq2 = i0cmapdq2 + (maxm+2*num_ghost)*num_eqn - i0cpapdq2 = i0cpamdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxp = i0cpapdq2 + (maxm+2*num_ghost)*num_eqn - i0bmcqxxm = i0bmcqxxp + (maxm+2*num_ghost)*num_eqn - i0bpcqxxp = i0bmcqxxm + (maxm+2*num_ghost)*num_eqn - i0bpcqxxm = i0bpcqxxp + (maxm+2*num_ghost)*num_eqn - i0cmcqxxp = i0bpcqxxm + (maxm+2*num_ghost)*num_eqn - i0cmcqxxm = i0cmcqxxp + (maxm+2*num_ghost)*num_eqn - i0cpcqxxp = i0cmcqxxm + (maxm+2*num_ghost)*num_eqn - i0cpcqxxm = i0cpcqxxp + (maxm+2*num_ghost)*num_eqn - i0bmcmamdq = i0cpcqxxm + (maxm+2*num_ghost)*num_eqn - i0bmcmapdq = i0bmcmamdq + (maxm+2*num_ghost)*num_eqn - i0bpcmamdq = i0bmcmapdq + (maxm+2*num_ghost)*num_eqn - i0bpcmapdq = i0bpcmamdq + (maxm+2*num_ghost)*num_eqn - i0bmcpamdq = i0bpcmapdq + (maxm+2*num_ghost)*num_eqn - i0bmcpapdq = i0bmcpamdq + (maxm+2*num_ghost)*num_eqn - i0bpcpamdq = i0bmcpapdq + (maxm+2*num_ghost)*num_eqn - i0bpcpapdq = i0bpcpamdq + (maxm+2*num_ghost)*num_eqn - iused = i0bpcpapdq + (maxm+2*num_ghost)*num_eqn - 1 - - if (iused > mwork) then - ! # This shouldn't happen due to checks in claw3 - write(6,*) '*** not enough work space in step3' - write(6,*) '*** iused = ', iused, ' mwork =',mwork - stop - endif - - - - index_capa = method(6) -! num_aux = method(7) - cfl = 0.d0 - dtdx = dt/dx - dtdy = dt/dy - dtdz = dt/dz - - if (index_capa == 0) then - ! # no capa array: - do 5 i=1-num_ghost,maxm+num_ghost - dtdx1d(i) = dtdx - dtdy1d(i) = dtdy - dtdz1d(i) = dtdz - 5 END DO - endif - - -! # perform x-sweeps -! ================== - - - do 50 k = 0,mz+1 - do 50 j = 0,my+1 - - forall (m = 1:num_eqn, i = 1-num_ghost:mx+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,i) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 23 i = 1-num_ghost, mx+num_ghost - dtdx1d(i) = dtdx / aux(index_capa,i,j,k) - 23 END DO - endif - - if (num_aux > 0) then - ! This odd construct may help improve cache locality. - ! (The F95 standard says each statement in the FORALL - ! must be executed for all indices before the next - ! statement is started, so this is different semantically - ! from putting all three indices in the same FORALL.) - forall (ka = -1:1) - forall (ma = 1:num_aux, i = 1-num_ghost:mx+num_ghost) - aux1(ma, i, 2+ka) = aux(ma, i, j-1, k+ka) - aux2(ma, i, 2+ka) = aux(ma, i, j, k+ka) - aux3(ma, i, 2+ka) = aux(ma, i, j+1, k+ka) - end forall - end forall - endif - - ! # Store the value of j and k along this slice in the common block - ! # comxyt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - jcom = j - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along - ! # this slice: - - call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & - q1d,dtdx1d,dtdy,dtdz,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # (rather than maintaining arrays f, g and h for the total fluxes, - ! # the modifications are used immediately to update qnew - ! # in order to save storage.) - - if(index_capa == 0)then - - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - dtdx * (fadd(m,i+1) - fadd(m,i)) & - - dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - - dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i)) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdy * gadd(m,1,0,i) & - - dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) ) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - dtdy * gadd(m,1,-1,i) & - - dtdz * hadd(m,1,-1,i) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - - dtdz * hadd(m,1,0,i) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) ) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - + dtdz * hadd(m,2,0,i) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - dtdy * gadd(m,1,1,i) & - + dtdz * hadd(m,2,-1,i) - end forall - - else - ! # with capa array - forall (m = 1:num_eqn, i = 1:mx) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i) & - - (dtdx * (fadd(m,i+1) - fadd(m,i)) & - + dtdy * (gadd(m,2,0,i) - gadd(m,1,0,i)) & - + dtdz * (hadd(m,2,0,i) - hadd(m,1,0,i))) & - / aux(index_capa,i,j,k) - - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdy * gadd(m,1,0,i) & - + dtdz * ( hadd(m,2,-1,i) & - - hadd(m,1,-1,i) )) & - / aux(index_capa,i,j-1,k) - qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & - - (dtdy * gadd(m,1,-1,i) & - + dtdz * hadd(m,1,-1,i)) & - / aux(index_capa,i,j-1,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdy * ( gadd(m,2,-1,i) & - - gadd(m,1,-1,i) ) & - + dtdz * hadd(m,1,0,i)) & - / aux(index_capa,i,j,k-1) - qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & - + (dtdy * gadd(m,2,-1,i) & - - dtdz * hadd(m,1,1,i)) & - / aux(index_capa,i,j+1,k-1) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - + (dtdy * gadd(m,2,0,i) & - - dtdz * ( hadd(m,2,1,i) & - - hadd(m,1,1,i) )) & - / aux(index_capa,i,j+1,k) - qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & - + (dtdy * gadd(m,2,1,i) & - + dtdz * hadd(m,2,1,i)) & - / aux(index_capa,i,j+1,k+1) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - - (dtdy * ( gadd(m,2,1,i) & - - gadd(m,1,1,i) ) & - - dtdz * hadd(m,2,0,i)) & - / aux(index_capa,i,j,k+1) - qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & - - (dtdy * gadd(m,1,1,i) & - - dtdz * hadd(m,2,-1,i)) & - / aux(index_capa,i,j-1,k+1) - end forall - endif - - 50 END DO - - - -! # perform y sweeps -! ================== - - - do 100 k = 0, mz+1 - do 100 i = 0, mx+1 - - forall (m = 1:num_eqn, j = 1-num_ghost:my+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,j) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 71 j = 1-num_ghost, my+num_ghost - dtdy1d(j) = dtdy / aux(index_capa,i,j,k) - 71 END DO - endif - - if (num_aux > 0) then - ! aux1, aux2, aux3 probably fit in cache, so optimize - ! access to aux - forall (ma=1:num_aux,ia= -1:1, j = 1-num_ghost:my+num_ghost) - aux1(ma, j, 2+ia) = aux(ma, i+ia, j, k-1) - aux2(ma, j, 2+ia) = aux(ma, i+ia, j, k) - aux3(ma, j, 2+ia) = aux(ma, i+ia, j, k+1) - end forall - endif - - ! # Store the value of i and k along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - kcom = k - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & - q1d,dtdy1d,dtdz,dtdx,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the g-fluxes - ! # gadd - modifies the h-fluxes - ! # hadd - modifies the f-fluxes - - if( index_capa == 0)then - ! # no capa array. Standard flux differencing: - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - dtdy * (fadd(m,j+1) - fadd(m,j)) & - - dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - - dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j)) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) ) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - + dtdx * hadd(m,2,0,j) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - dtdz * gadd(m,1,1,j) & - + dtdx * hadd(m,2,-1,j) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - dtdz * gadd(m,1,0,j) & - - dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) ) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - dtdz * gadd(m,1,-1,j) & - - dtdx * hadd(m,1,-1,j) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - - dtdx * hadd(m,1,0,j) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j) - end forall - else - - ! #with capa array. - - forall (m = 1:num_eqn, j = 1:my) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j) & - - (dtdy * (fadd(m,j+1) - fadd(m,j)) & - + dtdz * (gadd(m,2,0,j) - gadd(m,1,0,j)) & - + dtdx * (hadd(m,2,0,j) - hadd(m,1,0,j))) & - / aux(index_capa,i,j,k) - qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & - + (dtdz * gadd(m,2,0,j) & - - dtdx * ( hadd(m,2,1,j) & - - hadd(m,1,1,j) )) & - / aux(index_capa,i,j,k+1) - qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & - + (dtdz * gadd(m,2,1,j) & - + dtdx * hadd(m,2,1,j)) & - / aux(index_capa,i+1,j,k+1) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - - (dtdz * ( gadd(m,2,1,j) & - - gadd(m,1,1,j) ) & - - dtdx * hadd(m,2,0,j) ) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & - - (dtdz * gadd(m,1,1,j) & - - dtdx * hadd(m,2,-1,j)) & - / aux(index_capa,i+1,j,k-1) - qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & - - (dtdz * gadd(m,1,0,j) & - + dtdx * ( hadd(m,2,-1,j) & - - hadd(m,1,-1,j) )) & - / aux(index_capa,i,j,k-1) - qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & - - (dtdz * gadd(m,1,-1,j) & - + dtdx * hadd(m,1,-1,j)) & - / aux(index_capa,i-1,j,k-1) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdz * ( gadd(m,2,-1,j) & - - gadd(m,1,-1,j) ) & - + dtdx * hadd(m,1,0,j)) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & - + (dtdz * gadd(m,2,-1,j) & - - dtdx*hadd(m,1,1,j)) & - / aux(index_capa,i-1,j,k+1) - end forall - endif - - 100 END DO - - - -! # perform z sweeps -! ================== - - - do 150 j = 0, my+1 - do 150 i = 0, mx+1 - - forall (m = 1:num_eqn, k = 1-num_ghost:mz+num_ghost) - ! # copy data along a slice into 1d array: - q1d(m,k) = qold(m,i,j,k) - end forall - - if (index_capa > 0) then - do 130 k = 1-num_ghost, mz+num_ghost - dtdz1d(k) = dtdz / aux(index_capa,i,j,k) - 130 END DO - endif - - if (num_aux > 0) then - ! See the comment on the X sweeps. This is semantically - ! slightly different than putting all the indices in the - ! same forall, and hopefully better for optimizing access - ! to aux. - forall (ja = -1:1, k = 1-num_ghost:mz+num_ghost) - forall (ma = 1:num_aux) - aux1(ma, k, 2+ja) = aux(ma, i-1, j+ja, k) - aux2(ma, k, 2+ja) = aux(ma, i, j+ja, k) - aux3(ma, k, 2+ja) = aux(ma, i+1, j+ja, k) - end forall - end forall - endif - - ! # Store the value of i and j along this slice in the common block - ! # comxyzt in case it is needed in the Riemann solver (for - ! # variable coefficient problems) - - icom = i - jcom = j - - ! # compute modifications fadd, gadd and hadd to fluxes along this - ! # slice: - - call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & - q1d,dtdz1d,dtdx,dtdy,aux1,aux2,aux3,num_aux, & - method,mthlim,qadd,fadd,gadd,hadd,cfl1d, & - work(i0wave),work(i0s),work(i0amdq), & - work(i0apdq),work(i0cqxx), & - work(i0bmamdq),work(i0bmapdq), & - work(i0bpamdq),work(i0bpapdq), & - work(i0cmamdq),work(i0cmapdq), & - work(i0cpamdq),work(i0cpapdq), & - work(i0cmamdq2),work(i0cmapdq2), & - work(i0cpamdq2),work(i0cpapdq2), & - work(i0bmcqxxp),work(i0bpcqxxp), & - work(i0bmcqxxm),work(i0bpcqxxm), & - work(i0cmcqxxp),work(i0cpcqxxp), & - work(i0cmcqxxm),work(i0cpcqxxm), & - work(i0bmcmamdq),work(i0bmcmapdq), & - work(i0bpcmamdq),work(i0bpcmapdq), & - work(i0bmcpamdq),work(i0bmcpapdq), & - work(i0bpcpamdq),work(i0bpcpapdq), & - use_fwave,rpn3,rpt3,rptt3) - - cfl = dmax1(cfl,cfl1d) - - ! # update qnew by flux differencing. - ! # Note that the roles of the flux updates are changed. - ! # fadd - modifies the h-fluxes - ! # gadd - modifies the f-fluxes - ! # hadd - modifies the g-fluxes - - if(index_capa == 0)then - - ! #no capa array. Standard flux differencing: - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - dtdz * (fadd(m,k+1) - fadd(m,k)) & - - dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - - dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k)) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - + dtdy * hadd(m,2,0,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) ) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - - dtdy * hadd(m,1,0,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - dtdx * gadd(m,1,-1,k) & - - dtdy * hadd(m,1,-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - dtdx * gadd(m,1,0,k) & - - dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) ) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - dtdx * gadd(m,1,1,k) & - + dtdy * hadd(m,2,-1,k) - end forall - else - - ! # with capa array - - forall (m = 1:num_eqn, k = 1:mz) - qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k) & - - (dtdz * (fadd(m,k+1) - fadd(m,k)) & - + dtdx * (gadd(m,2,0,k) - gadd(m,1,0,k)) & - + dtdy * (hadd(m,2,0,k) - hadd(m,1,0,k))) & - / aux(index_capa,i,j,k) - qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & - - (dtdx * ( gadd(m,2,1,k) & - - gadd(m,1,1,k) ) & - - dtdy * hadd(m,2,0,k)) & - / aux(index_capa,i,j+1,k) - qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & - + (dtdx * gadd(m,2,1,k) & - + dtdy * hadd(m,2,1,k)) & - / aux(index_capa,i+1,j+1,k) - qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & - + (dtdx * gadd(m,2,0,k) & - - dtdy * ( hadd(m,2,1,k) & - - hadd(m,1,1,k) )) & - / aux(index_capa,i+1,j,k) - qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & - + (dtdx * gadd(m,2,-1,k) & - - dtdy * hadd(m,1,1,k)) & - / aux(index_capa,i+1,j-1,k) - qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & - - (dtdx * ( gadd(m,2,-1,k) & - - gadd(m,1,-1,k) ) & - + dtdy * hadd(m,1,0,k)) & - / aux(index_capa,i,j-1,k) - qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & - - (dtdx * gadd(m,1,-1,k) & - + dtdy * hadd(m,1,-1,k)) & - / aux(index_capa,i-1,j-1,k) - qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & - - (dtdx * gadd(m,1,0,k) & - + dtdy * ( hadd(m,2,-1,k) & - - hadd(m,1,-1,k) )) & - / aux(index_capa,i-1,j,k) - qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & - - (dtdx * gadd(m,1,1,k) & - - dtdy * hadd(m,2,-1,k)) & - / aux(index_capa,i-1,j+1,k) - end forall - endif - - 150 END DO - - - return - end subroutine step3 + common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom + + ! partition work array into pieces needed for local storage in + ! flux3 routine. Find starting index of each piece: + + nthreads=1 ! Serial + !$omp parallel + !$omp single + !$ nthreads = omp_get_num_threads() + !$omp end single + !$omp end parallel + + nsiz = (maxm+2*num_ghost)*num_eqn + nsiz_w = nsiz * num_waves + nsiz_s = (maxm+2*num_ghost)*num_waves + + i0wave = 1 + i0s = i0wave + nsiz_w*nthreads + i0amdq = i0s + nsiz_s*nthreads + i0apdq = i0amdq + nsiz*nthreads + i0cqxx = i0apdq + nsiz*nthreads + i0bmamdq = i0cqxx + nsiz*nthreads + i0bmapdq = i0bmamdq + nsiz*nthreads + i0bpamdq = i0bmapdq + nsiz*nthreads + i0bpapdq = i0bpamdq + nsiz*nthreads + i0cmamdq = i0bpapdq + nsiz*nthreads + i0cmapdq = i0cmamdq + nsiz*nthreads + i0cpamdq = i0cmapdq + nsiz*nthreads + i0cpapdq = i0cpamdq + nsiz*nthreads + i0cmamdq2 = i0cpapdq + nsiz*nthreads + i0cmapdq2 = i0cmamdq2 + nsiz*nthreads + i0cpamdq2 = i0cmapdq2 + nsiz*nthreads + i0cpapdq2 = i0cpamdq2 + nsiz*nthreads + i0bmcqxxp = i0cpapdq2 + nsiz*nthreads + i0bmcqxxm = i0bmcqxxp + nsiz*nthreads + i0bpcqxxp = i0bmcqxxm + nsiz*nthreads + i0bpcqxxm = i0bpcqxxp + nsiz*nthreads + i0cmcqxxp = i0bpcqxxm + nsiz*nthreads + i0cmcqxxm = i0cmcqxxp + nsiz*nthreads + i0cpcqxxp = i0cmcqxxm + nsiz*nthreads + i0cpcqxxm = i0cpcqxxp + nsiz*nthreads + i0bmcmamdq = i0cpcqxxm + nsiz*nthreads + i0bmcmapdq = i0bmcmamdq + nsiz*nthreads + i0bpcmamdq = i0bmcmapdq + nsiz*nthreads + i0bpcmapdq = i0bpcmamdq + nsiz*nthreads + i0bmcpamdq = i0bpcmapdq + nsiz*nthreads + i0bmcpapdq = i0bmcpamdq + nsiz*nthreads + i0bpcpamdq = i0bmcpapdq + nsiz*nthreads + i0bpcpapdq = i0bpcpamdq + nsiz*nthreads + iused = i0bpcpapdq + nsiz*nthreads - 1 + + if (iused > mwork) then + ! This shouldn't happen due to checks in claw3 + write(6,*) '*** not enough work space in step3()' + write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads + stop + endif + + index_capa = method(6) + !num_aux = method(7) + cfl = 0.d0 + dtdx = dt/dx + dtdy = dt/dy + dtdz = dt/dz + + ! Calculate the number of blocks to use in each grid direction + ! (integer division, rounding up) + nblk_i = (mx + blksiz_i + 1)/blksiz_i + nblk_j = (my + blksiz_j + 1)/blksiz_j + nblk_k = (mz + blksiz_k + 1)/blksiz_k + + !$omp parallel private(me,me1,m,i,j,k,ma,ia,ja,ka,cfl1d,ioffset,joffset,koffset, & + !$omp iblk,jblk,kblk) reduction(max:cfl) + + me = 0 + !$ me = omp_get_thread_num() + me1 = me + 1 + + if (index_capa == 0) then + ! no capa array: + do i=1-num_ghost,maxm+num_ghost + dtdx1d(i,me1) = dtdx + dtdy1d(i,me1) = dtdy + dtdz1d(i,me1) = dtdz + end do + endif + + ! perform x-sweeps + ! ================== + + ! This block-based approach keeps race conditions from happening + ! with OpenMP parallelization. Because the results of the flux3 + ! call affect the column flux3 was called on and all adjacent + ! columns, an easy way to keep threads from interfering with each + ! other is to make sure that every thread takes a column that is + ! at least three cells away from all the other threads in each + ! direction. The code here accomplishes this by breaking the + ! columns into blocks; for a block size of 2 in all directions, + ! the breakdown looks like: + ! + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! xx..xx.. + ! xx..xx.. + ! **++**++ + ! **++**++ + ! + ! All the blocks of columns labeled with the same symbol (e.g. x) + ! are done in parallel in one pass; within each block, the work is + ! all done by one thread, avoiding race conditions. Then the code + ! flushes memory and synchronizes, and starts the next set of + ! blocks (e.g. *). + ! + ! Breaking up into blocks in this fashion helps provide a large + ! number of small, independent work items, which helps with + ! parallel performance. + + do koffset=0,1 + do joffset=0,1 + + ! Guided or dynamic scheduling is necessary to keep all the + ! threads busy if the work per column is very nonuniform. With + ! many blocks to process, guided is probably the best choice. + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do jblk = joffset, nblk_j-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + + ! copy data along a slice into 1d array: + do i = 1-num_ghost, mx+num_ghost + do m = 1, num_eqn + q1d(m,i,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do i = 1-num_ghost, mx+num_ghost + dtdx1d(i,me1) = dtdx / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux1(ma, i, 2+ka, me1) = aux(ma, i, j-1, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux2(ma, i, 2+ka, me1) = aux(ma, i, j, k+ka) + end do + end do + end do + do ka = -1, 1 + do i = 1-num_ghost, mx+num_ghost + do ma = 1, num_aux + aux3(ma, i, 2+ka, me1) = aux(ma, i, j+1, k+ka) + end do + end do + end do + endif + + ! Store the value of j and k along this slice in the common block + ! comxyt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + jcom = j + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along + ! this slice: + + call flux3(1,maxm,num_eqn,num_waves,num_ghost,mx, & + q1d(1,1-num_ghost,me1),dtdx1d(1-num_ghost,me1),dtdy,dtdz, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! (rather than maintaining arrays f, g and h for + ! the total fluxes, the modifications are used immediately + ! to update qnew in order to save storage.) + + if(index_capa == 0)then + ! Order so that access to qnew is in memory-contiguous + ! order as much as possible + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - dtdy * gadd(m,1,-1,i,me1) & + - dtdz * hadd(m,1,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + - dtdz * hadd(m,1,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1) + end do + end do + + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdy * gadd(m,1,0,i,me1) & + - dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + - dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + - dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) ) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - dtdy * gadd(m,1,1,i,me1) & + + dtdz * hadd(m,2,-1,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + + dtdz * hadd(m,2,0,i,me1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1) + end do + end do + else + ! with capa array + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k-1) = qnew(m,i,j-1,k-1) & + - (dtdy * gadd(m,1,-1,i,me1) & + + dtdz * hadd(m,1,-1,i,me1)) & + / aux(index_capa,i,j-1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdy * ( gadd(m,2,-1,i,me1) & + - gadd(m,1,-1,i,me1) ) & + + dtdz * hadd(m,1,0,i,me1)) & + / aux(index_capa,i,j,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k-1) = qnew(m,i,j+1,k-1) & + + (dtdy * gadd(m,2,-1,i,me1) & + - dtdz * hadd(m,1,1,i,me1)) & + / aux(index_capa,i,j+1,k-1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdy * gadd(m,1,0,i,me1) & + + dtdz * ( hadd(m,2,-1,i,me1) & + - hadd(m,1,-1,i,me1) )) & + / aux(index_capa,i,j-1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,i,me1) & + - (dtdx * (fadd(m,i+1,me1) - fadd(m,i,me1)) & + + dtdy * (gadd(m,2,0,i,me1) - gadd(m,1,0,i,me1)) & + + dtdz * (hadd(m,2,0,i,me1) - hadd(m,1,0,i,me1)))& + / aux(index_capa,i,j,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + + (dtdy * gadd(m,2,0,i,me1) & + - dtdz * ( hadd(m,2,1,i,me1) & + - hadd(m,1,1,i,me1) )) & + / aux(index_capa,i,j+1,k) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j-1,k+1) = qnew(m,i,j-1,k+1) & + - (dtdy * gadd(m,1,1,i,me1) & + - dtdz * hadd(m,2,-1,i,me1)) & + / aux(index_capa,i,j-1,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + - (dtdy * ( gadd(m,2,1,i,me1) & + - gadd(m,1,1,i,me1) ) & + - dtdz * hadd(m,2,0,i,me1)) & + / aux(index_capa,i,j,k+1) + end do + end do + do i = 1, mx + do m = 1, num_eqn + qnew(m,i,j+1,k+1) = qnew(m,i,j+1,k+1) & + + (dtdy * gadd(m,2,1,i,me1) & + + dtdz * hadd(m,2,1,i,me1)) & + / aux(index_capa,i,j+1,k+1) + end do + end do + endif + + end do + end do ! End loops within block + + end do + end do ! End loops over blocks + + ! Flush memory (may not be necessary), then make sure everybody + ! synchronizes before the next offset + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform y sweeps + ! ================== + + do koffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do kblk = koffset, nblk_k-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do k = kblk*blksiz_k, min((kblk+1)*blksiz_k-1, mz+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + ! copy data along a slice into 1d array: + do j = 1-num_ghost, my+num_ghost + do m = 1, num_eqn + q1d(m,j,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do j = 1-num_ghost, my+num_ghost + dtdy1d(j,me1) = dtdy / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + ! aux1, aux2, aux3 probably fit in cache, so optimize + ! access to aux + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux1(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k-1) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux2(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k) + end do + end do + end do + do j = 1-num_ghost, my+num_ghost + do ia = -1, 1 + do ma = 1, num_aux + aux3(ma, j, 2+ia, me1) = aux(ma, i+ia, j, k+1) + end do + end do + end do + endif + + ! Store the value of i and k along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + kcom = k + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(2,maxm,num_eqn,num_waves,num_ghost,my, & + q1d(1,1-num_ghost,me1),dtdy1d(1-num_ghost,me1),dtdz,dtdx, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1),& + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1),& + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s),& + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the g-fluxes + ! gadd - modifies the h-fluxes + ! hadd - modifies the f-fluxes + + if( index_capa == 0)then + ! no capa array. Standard flux differencing: + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - dtdz * gadd(m,1,-1,j,me1) & + - dtdx * hadd(m,1,-1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - dtdz * gadd(m,1,0,j,me1) & + - dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - dtdz * gadd(m,1,1,j,me1) & + + dtdx * hadd(m,2,-1,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + - dtdx * hadd(m,1,0,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + - dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + - dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + + dtdx * hadd(m,2,0,j,me1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) ) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1) + end do + end do + else + ! with capa array. + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k-1) = qnew(m,i-1,j,k-1) & + - (dtdz * gadd(m,1,-1,j,me1) & + + dtdx * hadd(m,1,-1,j,me1)) & + / aux(index_capa,i-1,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i,j,k-1) = qnew(m,i,j,k-1) & + - (dtdz * gadd(m,1,0,j,me1) & + + dtdx * ( hadd(m,2,-1,j,me1) & + - hadd(m,1,-1,j,me1) )) & + / aux(index_capa,i,j,k-1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k-1) = qnew(m,i+1,j,k-1) & + - (dtdz * gadd(m,1,1,j,me1) & + - dtdx * hadd(m,2,-1,j,me1)) & + / aux(index_capa,i+1,j,k-1) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdz * ( gadd(m,2,-1,j,me1) & + - gadd(m,1,-1,j,me1) ) & + + dtdx * hadd(m,1,0,j,me1)) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,j,me1) & + - (dtdy * (fadd(m,j+1,me1) - fadd(m,j,me1)) & + + dtdz * (gadd(m,2,0,j,me1) - gadd(m,1,0,j,me1)) & + + dtdx * (hadd(m,2,0,j,me1) - hadd(m,1,0,j,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + - (dtdz * ( gadd(m,2,1,j,me1) & + - gadd(m,1,1,j,me1) ) & + - dtdx * hadd(m,2,0,j,me1) ) & + / aux(index_capa,i+1,j,k) + end do + end do + do j = 1, my + do m = 1, num_eqn + qnew(m,i-1,j,k+1) = qnew(m,i-1,j,k+1) & + + (dtdz * gadd(m,2,-1,j,me1) & + - dtdx*hadd(m,1,1,j,me1)) & + / aux(index_capa,i-1,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i,j,k+1) = qnew(m,i,j,k+1) & + + (dtdz * gadd(m,2,0,j,me1) & + - dtdx * ( hadd(m,2,1,j,me1) & + - hadd(m,1,1,j,me1) )) & + / aux(index_capa,i,j,k+1) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k+1) = qnew(m,i+1,j,k+1) & + + (dtdz * gadd(m,2,1,j,me1) & + + dtdx * hadd(m,2,1,j,me1)) & + / aux(index_capa,i+1,j,k+1) + end do + end do + endif + + end do + end do ! End loops within blocks + + end do + end do ! End loops over blocks + + !$omp flush + !$omp barrier + end do + end do ! End loops over offsets + + ! perform z sweeps + ! ================== + + do joffset=0,1 + do ioffset=0,1 + + !$omp do collapse(2) schedule(guided) + do jblk = joffset, nblk_j-1, 2 + do iblk = ioffset, nblk_i-1, 2 + + do j = jblk*blksiz_j, min((jblk+1)*blksiz_j-1, my+1) + do i = iblk*blksiz_i, min((iblk+1)*blksiz_i-1, mx+1) + + do k = 1-num_ghost, mz+num_ghost + do m = 1, num_eqn + q1d(m,k,me1) = qold(m,i,j,k) + end do + end do + + if (index_capa > 0) then + do k = 1-num_ghost, mz+num_ghost + dtdz1d(k,me1) = dtdz / aux(index_capa,i,j,k) + end do + endif + + if (num_aux > 0) then + do k = 1-num_ghost, mz+num_ghost + do ja = -1, 1 + do ma = 1, num_aux + aux1(ma, k, 2+ja, me1) = aux(ma, i-1, j+ja, k) + end do + do ma = 1, num_aux + aux2(ma, k, 2+ja, me1) = aux(ma, i, j+ja, k) + end do + do ma = 1, num_aux + aux3(ma, k, 2+ja, me1) = aux(ma, i+1, j+ja, k) + end do + end do + end do + endif + + + + ! Store the value of i and j along this slice in the common block + ! comxyzt in case it is needed in the Riemann solver (for + ! variable coefficient problems) + + icom = i + jcom = j + + ! compute modifications fadd, gadd and hadd to fluxes along this + ! slice: + + call flux3(3,maxm,num_eqn,num_waves,num_ghost,mz, & + q1d(1,1-num_ghost,me1),dtdz1d(1-num_ghost,me1),dtdx,dtdy, & + aux1(1,1-num_ghost,1,me1),aux2(1,1-num_ghost,1,me1), & + aux3(1,1-num_ghost,1,me1),num_aux, & + method,mthlim,qadd(1,1-num_ghost,me1), & + fadd(1,1-num_ghost,me1),gadd(1,1,-1,1-num_ghost,me1), & + hadd(1,1,-1,1-num_ghost,me1),cfl1d, & + work(i0wave+me*nsiz_w),work(i0s+me*nsiz_s), & + work(i0amdq+me*nsiz), & + work(i0apdq+me*nsiz),work(i0cqxx+me*nsiz), & + work(i0bmamdq+me*nsiz),work(i0bmapdq+me*nsiz), & + work(i0bpamdq+me*nsiz),work(i0bpapdq+me*nsiz), & + work(i0cmamdq+me*nsiz),work(i0cmapdq+me*nsiz), & + work(i0cpamdq+me*nsiz),work(i0cpapdq+me*nsiz), & + work(i0cmamdq2+me*nsiz),work(i0cmapdq2+me*nsiz), & + work(i0cpamdq2+me*nsiz),work(i0cpapdq2+me*nsiz), & + work(i0bmcqxxp+me*nsiz),work(i0bpcqxxp+me*nsiz), & + work(i0bmcqxxm+me*nsiz),work(i0bpcqxxm+me*nsiz), & + work(i0cmcqxxp+me*nsiz),work(i0cpcqxxp+me*nsiz), & + work(i0cmcqxxm+me*nsiz),work(i0cpcqxxm+me*nsiz), & + work(i0bmcmamdq+me*nsiz),work(i0bmcmapdq+me*nsiz), & + work(i0bpcmamdq+me*nsiz),work(i0bpcmapdq+me*nsiz), & + work(i0bmcpamdq+me*nsiz),work(i0bmcpapdq+me*nsiz), & + work(i0bpcpamdq+me*nsiz),work(i0bpcpapdq+me*nsiz), & + use_fwave,rpn3,rpt3,rptt3) + + cfl = dmax1(cfl,cfl1d) + + ! update qnew by flux differencing. + ! Note that the roles of the flux updates are changed. + ! fadd - modifies the h-fluxes + ! gadd - modifies the f-fluxes + ! hadd - modifies the g-fluxes + + if(index_capa == 0)then + + ! no capa array. Standard flux differencing: + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - dtdx * gadd(m,1,-1,k,me1) & + - dtdy * hadd(m,1,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + - dtdy * hadd(m,1,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - dtdx * gadd(m,1,0,k,me1) & + - dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) ) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + - dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + - dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) ) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - dtdx * gadd(m,1,1,k,me1) & + + dtdy * hadd(m,2,-1,k,me1) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + + dtdy * hadd(m,2,0,k,me1) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1) + end do + end do + else + + ! with capa array + do k = 1, mz + do m = 1, num_eqn + qnew(m,i-1,j-1,k) = qnew(m,i-1,j-1,k) & + - (dtdx * gadd(m,1,-1,k,me1) & + + dtdy * hadd(m,1,-1,k,me1)) & + / aux(index_capa,i-1,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i,j-1,k) = qnew(m,i,j-1,k) & + - (dtdx * ( gadd(m,2,-1,k,me1) & + - gadd(m,1,-1,k,me1) ) & + + dtdy * hadd(m,1,0,k,me1)) & + / aux(index_capa,i,j-1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j-1,k) = qnew(m,i+1,j-1,k) & + + (dtdx * gadd(m,2,-1,k,me1) & + - dtdy * hadd(m,1,1,k,me1)) & + / aux(index_capa,i+1,j-1,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j,k) = qnew(m,i-1,j,k) & + - (dtdx * gadd(m,1,0,k,me1) & + + dtdy * ( hadd(m,2,-1,k,me1) & + - hadd(m,1,-1,k,me1) )) & + / aux(index_capa,i-1,j,k) + end do + do m = 1, num_eqn + qnew(m,i,j,k) = qnew(m,i,j,k) + qadd(m,k,me1) & + - (dtdz * (fadd(m,k+1,me1) - fadd(m,k,me1)) & + + dtdx * (gadd(m,2,0,k,me1) - gadd(m,1,0,k,me1)) & + + dtdy * (hadd(m,2,0,k,me1) - hadd(m,1,0,k,me1)))& + / aux(index_capa,i,j,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j,k) = qnew(m,i+1,j,k) & + + (dtdx * gadd(m,2,0,k,me1) & + - dtdy * ( hadd(m,2,1,k,me1) & + - hadd(m,1,1,k,me1) )) & + / aux(index_capa,i+1,j,k) + end do + + do m = 1, num_eqn + qnew(m,i-1,j+1,k) = qnew(m,i-1,j+1,k) & + - (dtdx * gadd(m,1,1,k,me1) & + - dtdy * hadd(m,2,-1,k,me1)) & + / aux(index_capa,i-1,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i,j+1,k) = qnew(m,i,j+1,k) & + - (dtdx * ( gadd(m,2,1,k,me1) & + - gadd(m,1,1,k,me1) ) & + - dtdy * hadd(m,2,0,k,me1)) & + / aux(index_capa,i,j+1,k) + end do + do m = 1, num_eqn + qnew(m,i+1,j+1,k) = qnew(m,i+1,j+1,k) & + + (dtdx * gadd(m,2,1,k,me1) & + + dtdy * hadd(m,2,1,k,me1)) & + / aux(index_capa,i+1,j+1,k) + end do + end do + endif + + end do + end do + + end do + end do + + !$omp flush + !$omp barrier + end do + end do + !$omp end parallel + + return +end subroutine step3 diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index 3b3ed70a6..79d9f0d89 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -118,8 +118,9 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & if (iused > mwork) then ! This shouldn't happen due to checks in claw3 - write(6,*) '*** not enough work space in step3' + write(6,*) '*** not enough work space in step3ds()' write(6,*) '*** iused = ', iused, ' mwork =',mwork + write(6,*) '*** nthreads = ',nthreads stop endif From ba15781dfccec35f9361140f5fee42f818aea756 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Thu, 5 Nov 2015 09:28:07 -0800 Subject: [PATCH 14/19] Updated implementation to use 1 OpenMP thread by default and give a warning that the 'OMP_NUM_THREADS' environment varibale is unset. This allows an unaware user to run their codes without any changes, and they will see the warning and can adjust accordingly. --- src/pyclaw/classic/solver.py | 9 ++++++--- src/pyclaw/classic/step3.f90 | 5 +++++ src/pyclaw/classic/step3ds.f90 | 5 +++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 888cb181a..af9b94523 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -612,9 +612,12 @@ def __init__(self, riemann_solver=None, claw_package=None): self.work = None import os - import multiprocessing - self.nthreads = int(os.environ.get('OMP_NUM_THREADS',\ - multiprocessing.cpu_count())) + import warnings + if 'OMP_NUM_THREADS' in os.environ: + pass # Using OpenMP + else: + warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") + self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) super(ClawSolver3D,self).__init__(riemann_solver, claw_package) diff --git a/src/pyclaw/classic/step3.f90 b/src/pyclaw/classic/step3.f90 index ec8b91556..89233b85c 100644 --- a/src/pyclaw/classic/step3.f90 +++ b/src/pyclaw/classic/step3.f90 @@ -40,6 +40,7 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & dimension method(7),mthlim(num_waves) dimension work(mwork) logical :: use_fwave + character :: env_value ! These block sizes must be at least 2 to avoid multiple threads ! trying to update the same grid cell @@ -61,6 +62,10 @@ subroutine step3(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & ! flux3 routine. Find starting index of each piece: nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF !$omp parallel !$omp single !$ nthreads = omp_get_num_threads() diff --git a/src/pyclaw/classic/step3ds.f90 b/src/pyclaw/classic/step3ds.f90 index 79d9f0d89..4186d86e5 100644 --- a/src/pyclaw/classic/step3ds.f90 +++ b/src/pyclaw/classic/step3ds.f90 @@ -54,6 +54,7 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & dimension method(7),mthlim(num_waves) dimension work(mwork) logical :: use_fwave + character :: env_value common /comxyzt/ dtcom,dxcom,dycom,dzcom,tcom,icom,jcom,kcom @@ -71,6 +72,10 @@ subroutine step3ds(maxm,nthreads,num_eqn,num_waves,num_ghost,mx,my, & ! flux3 routine. Find starting index of each piece: nthreads=1 ! Serial + CALL GET_ENVIRONMENT_VARIABLE('OMP_NUM_THREADS',env_value,nlen,nstat) + IF(nstat >= 1)THEN + !$ call omp_set_num_threads(1) ! set default to 1 thread + END IF !$OMP PARALLEL !$OMP SINGLE !$ nthreads = omp_get_num_threads() From 55c4a3d48c8f2b22e02efd4ef41b97a92c20ce09 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Thu, 5 Nov 2015 09:52:37 -0800 Subject: [PATCH 15/19] small simplification to OMP_NUM_THREADS env variable check. --- src/pyclaw/classic/solver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index af9b94523..642320912 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -613,9 +613,7 @@ def __init__(self, riemann_solver=None, claw_package=None): import os import warnings - if 'OMP_NUM_THREADS' in os.environ: - pass # Using OpenMP - else: + if 'OMP_NUM_THREADS' not in os.environ: warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) From 92158db1bcede02fc25b5e5354aa15f425c45171 Mon Sep 17 00:00:00 2001 From: Wes Lowrie Date: Fri, 6 Nov 2015 09:31:15 -0800 Subject: [PATCH 16/19] Updated OpenMP warning to use the PyClaw logging. This prints to the console and to the log file. --- src/pyclaw/classic/solver.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 642320912..2b91c234e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -610,15 +610,17 @@ def __init__(self, riemann_solver=None, claw_package=None): self.aux2 = None self.aux3 = None self.work = None + self.nthreads = None + + super(ClawSolver3D,self).__init__(riemann_solver, claw_package) import os - import warnings if 'OMP_NUM_THREADS' not in os.environ: - warnings.warn("The environment variable 'OMP_NUM_THREADS' is unset. Set this variable to use OpenMP with more than 1 thread (i.e. export OMP_NUM_THREADS=4).") + self.logger.warn(""""The env variable 'OMP_NUM_THREADS' is unset. + Set this variable to use OpenMP with more + than 1 thread (i.e. export OMP_NUM_THREADS=4).""") self.nthreads = int(os.environ.get('OMP_NUM_THREADS',1)) - super(ClawSolver3D,self).__init__(riemann_solver, claw_package) - # ========== Setup routine ============================= def _allocate_workspace(self,solution): r""" From 0b595de01b554af20f52c18a0f3873435c33f42e Mon Sep 17 00:00:00 2001 From: Wes Lowrie ARA/SED Date: Wed, 22 Nov 2017 16:30:03 -0800 Subject: [PATCH 17/19] Various changes to pyclaw. Updates to properly handle restarts using hdf5 --- examples/euler_1d/shocksine.py | 7 ++++++- src/pyclaw/classic/setup.py | 2 +- src/pyclaw/classic/solver.py | 25 +++++++++++++++++++++++-- src/pyclaw/io/hdf5.py | 4 ++-- src/pyclaw/sharpclaw/solver.py | 21 +++++++++++++++++---- src/pyclaw/solver.py | 21 ++++++++++++++++++++- src/pyclaw/state.py | 2 +- 7 files changed, 70 insertions(+), 12 deletions(-) diff --git a/examples/euler_1d/shocksine.py b/examples/euler_1d/shocksine.py index 57d9229c8..b770f3ec9 100755 --- a/examples/euler_1d/shocksine.py +++ b/examples/euler_1d/shocksine.py @@ -52,10 +52,15 @@ def setup(use_petsc=False,iplot=False,htmlplot=False,outdir='./_output',solver_t if solver_type=='sharpclaw': solver = pyclaw.SharpClawSolver1D(rs) - solver.time_integrator = 'RK' + solver.time_integrator = 'Euler' solver.a, solver.b, solver.c = a, b, c solver.cfl_desired = 0.6 solver.cfl_max = 0.7 + + solver.weno_order = 2 + solver.num_ghost = 2 + solver.lim_type = 1 + if use_char_decomp: try: import sharpclaw1 # Import custom Fortran code diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py index a50de681b..d5ff10de7 100644 --- a/src/pyclaw/classic/setup.py +++ b/src/pyclaw/classic/setup.py @@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None): ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet']) config.add_extension('classic3', - ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp']) + ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp -Warray-bounds -fcheck=all'],libraries=['gomp']) return config diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 2b91c234e..57fdf6474 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -141,7 +141,7 @@ def step(self,solution,take_one_step,tstart,tend): # Godunov Splitting if self.source_split == 1: self.step_source(self,solution.states[0],self.dt) - + return True def _check_cfl_settings(self): @@ -684,7 +684,7 @@ def step_hyperbolic(self,solution): if self.dimensional_split: #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. - + #""" q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ @@ -699,6 +699,27 @@ def step_hyperbolic(self,solution): mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ self.fwave,rpn3,rpt3,rptt3) + #""" + + """ + #print "No Dim Splitting..." + # No Dimensional Splitting, Similar to SharpClaw + dq = qnew.copy('F') + dq[...] = 0. + dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ + self.fwave,rpn3,rpt3,rptt3) + dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ + self.fwave,rpn3,rpt3,rptt3) + dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ + self.fwave,rpn3,rpt3,rptt3) + self.qbc += dq + """ cfl = max(cfl_x,cfl_y,cfl_z) diff --git a/src/pyclaw/io/hdf5.py b/src/pyclaw/io/hdf5.py index 390e5e4ee..8dbde118a 100644 --- a/src/pyclaw/io/hdf5.py +++ b/src/pyclaw/io/hdf5.py @@ -193,11 +193,11 @@ def read(solution,frame,path='./',file_prefix='claw',read_aux=True, state = pyclaw.state.State(pyclaw_patch, \ patch.attrs['num_eqn'],patch.attrs['num_aux']) state.t = patch.attrs['t'] - state.q = patch['q'][:].reshape(state.q.shape,order='F') + state.q = patch['q'][:].ravel(order='F').reshape(state.q.shape,order='F') # Read in aux if applicable if read_aux and patch.get('aux',None) is not None: - state.aux = patch['aux'][:].reshape(state.aux.shape,order='F') + state.aux = patch['aux'][:].ravel(order='F').reshape(state.aux.shape,order='F') solution.states.append(state) patches.append(pyclaw_patch) diff --git a/src/pyclaw/sharpclaw/solver.py b/src/pyclaw/sharpclaw/solver.py index 3a42282b7..4158d5861 100644 --- a/src/pyclaw/sharpclaw/solver.py +++ b/src/pyclaw/sharpclaw/solver.py @@ -284,6 +284,7 @@ def step(self,solution,take_one_step,tstart,tend): Take one step with a Runge-Kutta or multistep method as specified by `solver.time_integrator`. """ + state = solution.states[0] step_index = self.status['numsteps'] + 1 if self.accept_step == True: @@ -515,9 +516,19 @@ def check_3rd_ord_cond(self,state,step_index,dtFE): def _set_mthlim(self): self._mthlim = self.limiters if not isinstance(self.limiters,list): self._mthlim=[self._mthlim] - if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves - if len(self._mthlim)!=self.num_waves: - raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') + if self.lim_type==1: + if self.char_decomp == 0 or self.char_decomp == 3: + if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_eqn + if len(self._mthlim)!=self.num_eqn: + raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_eqn') + elif self.char_decomp == 1 or self.char_decomp == 4: + if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves + if len(self._mthlim)!=self.num_waves: + raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') + else: + if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves + if len(self._mthlim)!=self.num_waves: + raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') def dq(self,state): @@ -526,7 +537,9 @@ def dq(self,state): """ deltaq = self.dq_hyperbolic(state) - + + #state.q += deltaq + #deltaq[...] = 0. if self.dq_src is not None: deltaq+=self.dq_src(self,state,self.dt) diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py index 5bc708c6a..e73209f3b 100644 --- a/src/pyclaw/solver.py +++ b/src/pyclaw/solver.py @@ -32,6 +32,14 @@ def before_step(solver,solution): """ pass +def after_step(solver,solution): + r""" + Dummy routine called after each step + + Replace this routine if you want to do something after each time step. + """ + pass + class Solver(object): r""" Pyclaw solver superclass. @@ -85,6 +93,13 @@ class Solver(object): def before_step(solver,solution) + .. attribute:: after_step + + Function called after each time step is taken. + The required signature for this function is: + + def after_step(solver,solution) + .. attribute:: dt_variable Whether to allow the time step to vary, ``default = True``. @@ -179,6 +194,7 @@ def __init__(self,riemann_solver=None,claw_package=None): self._use_old_bc_sig = False self.accept_step = True self.before_step = None + self.after_step = None # select package to build solver objects from, by default this will be # the package that contains the module implementing the derived class @@ -610,6 +626,9 @@ def evolve_to_time(self,solution,tend=None): # Note that the solver may alter dt during the step() routine self.step(solution,take_one_step,tstart,tend) + if self.after_step is not None: + self.after_step(self,solution.states[0]) + # Check to make sure that the Courant number was not too large cfl = self.cfl.get_cached_max() self.accept_step = self.accept_reject_step(state) @@ -642,7 +661,7 @@ def evolve_to_time(self,solution,tend=None): self.status['cflmax'] = \ max(cfl, self.status['cflmax']) raise Exception('CFL too large, giving up!') - + # See if we are finished yet if solution.t >= tend or take_one_step: break diff --git a/src/pyclaw/state.py b/src/pyclaw/state.py index c47dbc4d1..9851fd657 100755 --- a/src/pyclaw/state.py +++ b/src/pyclaw/state.py @@ -189,7 +189,7 @@ def is_valid(self): valid = False if self.aux is not None: if not self.aux.flags['F_CONTIGUOUS']: - logger.debug('q array is not Fortran contiguous.') + logger.debug('aux array is not Fortran contiguous.') valid = False return valid From cb6a50daf43fc6a483e5272c0b74007131a17ca8 Mon Sep 17 00:00:00 2001 From: Wes Lowrie ARA/SED Date: Mon, 27 Nov 2017 09:47:36 -0800 Subject: [PATCH 18/19] Revert "Various changes to pyclaw. Updates to properly handle restarts using hdf5" This reverts commit 0b595de01b554af20f52c18a0f3873435c33f42e. --- examples/euler_1d/shocksine.py | 7 +------ src/pyclaw/classic/setup.py | 2 +- src/pyclaw/classic/solver.py | 25 ++----------------------- src/pyclaw/io/hdf5.py | 4 ++-- src/pyclaw/sharpclaw/solver.py | 21 ++++----------------- src/pyclaw/solver.py | 21 +-------------------- src/pyclaw/state.py | 2 +- 7 files changed, 12 insertions(+), 70 deletions(-) diff --git a/examples/euler_1d/shocksine.py b/examples/euler_1d/shocksine.py index b770f3ec9..57d9229c8 100755 --- a/examples/euler_1d/shocksine.py +++ b/examples/euler_1d/shocksine.py @@ -52,15 +52,10 @@ def setup(use_petsc=False,iplot=False,htmlplot=False,outdir='./_output',solver_t if solver_type=='sharpclaw': solver = pyclaw.SharpClawSolver1D(rs) - solver.time_integrator = 'Euler' + solver.time_integrator = 'RK' solver.a, solver.b, solver.c = a, b, c solver.cfl_desired = 0.6 solver.cfl_max = 0.7 - - solver.weno_order = 2 - solver.num_ghost = 2 - solver.lim_type = 1 - if use_char_decomp: try: import sharpclaw1 # Import custom Fortran code diff --git a/src/pyclaw/classic/setup.py b/src/pyclaw/classic/setup.py index d5ff10de7..a50de681b 100644 --- a/src/pyclaw/classic/setup.py +++ b/src/pyclaw/classic/setup.py @@ -13,7 +13,7 @@ def configuration(parent_package='',top_path=None): ['limiter.f90','philim.f90','flux2.f90','step2ds.f90','step2.f90'],f2py_options=['--quiet']) config.add_extension('classic3', - ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp -Warray-bounds -fcheck=all'],libraries=['gomp']) + ['limiter.f90','philim.f90','flux3.f90','step3ds.f90','step3.f90'],f2py_options=['--quiet'],extra_f90_compile_args=['-fopenmp'],libraries=['gomp']) return config diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 57fdf6474..2b91c234e 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -141,7 +141,7 @@ def step(self,solution,take_one_step,tstart,tend): # Godunov Splitting if self.source_split == 1: self.step_source(self,solution.states[0],self.dt) - + return True def _check_cfl_settings(self): @@ -684,7 +684,7 @@ def step_hyperbolic(self,solution): if self.dimensional_split: #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. - #""" + q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ @@ -699,27 +699,6 @@ def step_hyperbolic(self,solution): mx,my,mz,q,q,self.auxbc,dx,dy,dz,self.dt,self._method,\ self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ self.fwave,rpn3,rpt3,rptt3) - #""" - - """ - #print "No Dim Splitting..." - # No Dimensional Splitting, Similar to SharpClaw - dq = qnew.copy('F') - dq[...] = 0. - dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ - mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ - self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ - self.fwave,rpn3,rpt3,rptt3) - dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ - mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ - self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ - self.fwave,rpn3,rpt3,rptt3) - dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ - mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ - self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ - self.fwave,rpn3,rpt3,rptt3) - self.qbc += dq - """ cfl = max(cfl_x,cfl_y,cfl_z) diff --git a/src/pyclaw/io/hdf5.py b/src/pyclaw/io/hdf5.py index 8dbde118a..390e5e4ee 100644 --- a/src/pyclaw/io/hdf5.py +++ b/src/pyclaw/io/hdf5.py @@ -193,11 +193,11 @@ def read(solution,frame,path='./',file_prefix='claw',read_aux=True, state = pyclaw.state.State(pyclaw_patch, \ patch.attrs['num_eqn'],patch.attrs['num_aux']) state.t = patch.attrs['t'] - state.q = patch['q'][:].ravel(order='F').reshape(state.q.shape,order='F') + state.q = patch['q'][:].reshape(state.q.shape,order='F') # Read in aux if applicable if read_aux and patch.get('aux',None) is not None: - state.aux = patch['aux'][:].ravel(order='F').reshape(state.aux.shape,order='F') + state.aux = patch['aux'][:].reshape(state.aux.shape,order='F') solution.states.append(state) patches.append(pyclaw_patch) diff --git a/src/pyclaw/sharpclaw/solver.py b/src/pyclaw/sharpclaw/solver.py index 4158d5861..3a42282b7 100644 --- a/src/pyclaw/sharpclaw/solver.py +++ b/src/pyclaw/sharpclaw/solver.py @@ -284,7 +284,6 @@ def step(self,solution,take_one_step,tstart,tend): Take one step with a Runge-Kutta or multistep method as specified by `solver.time_integrator`. """ - state = solution.states[0] step_index = self.status['numsteps'] + 1 if self.accept_step == True: @@ -516,19 +515,9 @@ def check_3rd_ord_cond(self,state,step_index,dtFE): def _set_mthlim(self): self._mthlim = self.limiters if not isinstance(self.limiters,list): self._mthlim=[self._mthlim] - if self.lim_type==1: - if self.char_decomp == 0 or self.char_decomp == 3: - if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_eqn - if len(self._mthlim)!=self.num_eqn: - raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_eqn') - elif self.char_decomp == 1 or self.char_decomp == 4: - if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves - if len(self._mthlim)!=self.num_waves: - raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') - else: - if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves - if len(self._mthlim)!=self.num_waves: - raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') + if len(self._mthlim)==1: self._mthlim = self._mthlim * self.num_waves + if len(self._mthlim)!=self.num_waves: + raise Exception('Length of solver.limiters is not equal to 1 or to solver.num_waves') def dq(self,state): @@ -537,9 +526,7 @@ def dq(self,state): """ deltaq = self.dq_hyperbolic(state) - - #state.q += deltaq - #deltaq[...] = 0. + if self.dq_src is not None: deltaq+=self.dq_src(self,state,self.dt) diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py index e73209f3b..5bc708c6a 100644 --- a/src/pyclaw/solver.py +++ b/src/pyclaw/solver.py @@ -32,14 +32,6 @@ def before_step(solver,solution): """ pass -def after_step(solver,solution): - r""" - Dummy routine called after each step - - Replace this routine if you want to do something after each time step. - """ - pass - class Solver(object): r""" Pyclaw solver superclass. @@ -93,13 +85,6 @@ class Solver(object): def before_step(solver,solution) - .. attribute:: after_step - - Function called after each time step is taken. - The required signature for this function is: - - def after_step(solver,solution) - .. attribute:: dt_variable Whether to allow the time step to vary, ``default = True``. @@ -194,7 +179,6 @@ def __init__(self,riemann_solver=None,claw_package=None): self._use_old_bc_sig = False self.accept_step = True self.before_step = None - self.after_step = None # select package to build solver objects from, by default this will be # the package that contains the module implementing the derived class @@ -626,9 +610,6 @@ def evolve_to_time(self,solution,tend=None): # Note that the solver may alter dt during the step() routine self.step(solution,take_one_step,tstart,tend) - if self.after_step is not None: - self.after_step(self,solution.states[0]) - # Check to make sure that the Courant number was not too large cfl = self.cfl.get_cached_max() self.accept_step = self.accept_reject_step(state) @@ -661,7 +642,7 @@ def evolve_to_time(self,solution,tend=None): self.status['cflmax'] = \ max(cfl, self.status['cflmax']) raise Exception('CFL too large, giving up!') - + # See if we are finished yet if solution.t >= tend or take_one_step: break diff --git a/src/pyclaw/state.py b/src/pyclaw/state.py index 9851fd657..c47dbc4d1 100755 --- a/src/pyclaw/state.py +++ b/src/pyclaw/state.py @@ -189,7 +189,7 @@ def is_valid(self): valid = False if self.aux is not None: if not self.aux.flags['F_CONTIGUOUS']: - logger.debug('aux array is not Fortran contiguous.') + logger.debug('q array is not Fortran contiguous.') valid = False return valid From 7a87fef924707c98fb5116c7929065a3df79ebdc Mon Sep 17 00:00:00 2001 From: Wes Lowrie ARA/SED Date: Mon, 27 Nov 2017 10:17:35 -0800 Subject: [PATCH 19/19] after_step readded to commit, and comments to be able to quickly switch from dimensional split to non-dimensional split solver. --- src/pyclaw/classic/solver.py | 23 +++++++++++++++++++++++ src/pyclaw/solver.py | 20 ++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/src/pyclaw/classic/solver.py b/src/pyclaw/classic/solver.py index 2b91c234e..20d3b030f 100644 --- a/src/pyclaw/classic/solver.py +++ b/src/pyclaw/classic/solver.py @@ -685,6 +685,7 @@ def step_hyperbolic(self,solution): #Right now only Godunov-dimensional-splitting is implemented. #Strang-dimensional-splitting could be added following dimsp3.f in Clawpack. + #""" q, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ mx,my,mz,qold,qnew,self.auxbc,dx,dy,dz,self.dt,self._method,\ self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ @@ -700,6 +701,28 @@ def step_hyperbolic(self,solution): self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ self.fwave,rpn3,rpt3,rptt3) + #""" + + """ + #print "No Dim Splitting..." + # No Dimensional Splitting, Similar to SharpClaw + dq = qnew.copy('F') + dq[...] = 0. + dq, cfl_x = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,1,\ + self.fwave,rpn3,rpt3,rptt3) + dq, cfl_y = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,2,\ + self.fwave,rpn3,rpt3,rptt3) + dq, cfl_z = self.fmod.step3ds(maxm,self.nthreads,self.num_ghost,\ + mx,my,mz,qold,dq,self.auxbc,dx,dy,dz,self.dt,self._method,\ + self._mthlim,self.aux1,self.aux2,self.aux3,self.work,3,\ + self.fwave,rpn3,rpt3,rptt3) + self.qbc += dq + """ + cfl = max(cfl_x,cfl_y,cfl_z) else: diff --git a/src/pyclaw/solver.py b/src/pyclaw/solver.py index 5bc708c6a..27a2acec5 100644 --- a/src/pyclaw/solver.py +++ b/src/pyclaw/solver.py @@ -32,6 +32,14 @@ def before_step(solver,solution): """ pass +def after_step(solver,solution): + r""" + Dummy routine called after each step + + Replace this routine if you want to do something after each time step. + """ + pass + class Solver(object): r""" Pyclaw solver superclass. @@ -85,6 +93,14 @@ class Solver(object): def before_step(solver,solution) + .. attribute:: after_step + + Function called after each time step is taken. + The required signature for this function is: + + def after_step(solver,solution) + + .. attribute:: dt_variable Whether to allow the time step to vary, ``default = True``. @@ -179,6 +195,7 @@ def __init__(self,riemann_solver=None,claw_package=None): self._use_old_bc_sig = False self.accept_step = True self.before_step = None + self.after_step = None # select package to build solver objects from, by default this will be # the package that contains the module implementing the derived class @@ -610,6 +627,9 @@ def evolve_to_time(self,solution,tend=None): # Note that the solver may alter dt during the step() routine self.step(solution,take_one_step,tstart,tend) + if self.after_step is not None: + self.after_step(self,solution.states[0]) + # Check to make sure that the Courant number was not too large cfl = self.cfl.get_cached_max() self.accept_step = self.accept_reject_step(state)