From a4b45fd76c91cd827365faa5167c9dc5d19c82f1 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Tue, 2 Jul 2024 13:46:29 -0600 Subject: [PATCH 01/14] Initial version with revised H^t operator --- src/CostFunction3D.cpp | 130 +++++++++++++++++++++++++++++++++++++++-- src/CostFunction3D.h | 14 ++++- 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 61f2cb3..7ac2e7f 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -98,11 +98,18 @@ void CostFunction3D::finalize() delete[] mPtr; delete[] mVal; delete[] I2H; + #pragma acc exit data delete(H,JH,IH) delete[] H; delete[] JH; delete[] IH; + #pragma acc exit data delete(Ht,JHt,IHt) + delete[] Ht; + delete[] JHt; + delete[] IHt; + + if (basisappx > 0) { delete[] basis0; delete[] basis1; @@ -480,6 +487,7 @@ void CostFunction3D::initState(const int iteration) } // end of iteration == 1 + //JMD variable-interleave for (int var = 0; var < varDim; var++) { // Init node variance for (int iIndex = 0; iIndex < iDim; iIndex++) { @@ -494,6 +502,7 @@ void CostFunction3D::initState(const int iteration) } } + //JMD variable-interleave // Compute and display the variable BG errors and RMS of values for (int var = 0; var < varDim; var++) { real varScale = 0; @@ -535,7 +544,23 @@ void CostFunction3D::initState(const int iteration) cout << "Beginning analysis...\n"; // HTd - calcHTranspose(innovation, stateC); + calcHTranspose2(innovation, stateC); + // calcHTranspose2(innovation, stateC2); + /* + exit(1); + + cout << "CostFunction3D::initState: " << endl; + + for (int i=120;i<140;i++) { + // if(stateC[i] != stateC2[i]) { + cout << i << " " << stateC[i] << " " << stateC2[i] << endl;} + // } + for (int i=0;i< 100; i++) { + cout << "IHt[" << i << "]:= " << IHt[i] << endl;} + for (int i=0;i< 100; i++) { + cout << "JHt[" << i << "]:= " << JHt[i] << endl;} + */ + #pragma acc data create(stateB[0:nState]) { @@ -606,7 +631,7 @@ void CostFunction3D::funcGradient(real* state, real* gradient) GPTLstart("CostFunction3D::funcGradient:calcHTranspose"); // HTHCq - calcHTranspose(HCq, stateC); + calcHTranspose2(HCq, stateC); GPTLstop("CostFunction3D::funcGradient:calcHTranspose"); GPTLstart("CostFunction3D::funcGradient:FFtransform"); @@ -671,7 +696,7 @@ real CostFunction3D::funcValueAndGradient(real* state, real *gradient) J = 0.5*(qIP + obIP); //Now Gradient (also uses HCq) - calcHTranspose(HCq, stateC); + calcHTranspose2(HCq, stateC); FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -699,7 +724,7 @@ void CostFunction3D::funcHessian(real* x, real *hessian) //calc HCx (store in global variable HCq) updateHCq(x,HCq); - calcHTranspose(HCq, stateC); + calcHTranspose2(HCq, stateC); FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -747,6 +772,7 @@ void CostFunction3D::updateBG() std::string cFilename = outputPath + "/samurai_Coefficients.out"; ofstream cstream(cFilename); + //JMD variable-interleave cstream << "Variable\tI\tJ\tK\tBackground\tAnalysis\tIncrement\n"; for (int var = 0; var < varDim; var++) { for (int iIndex = 0; iIndex < iDim; iIndex++) { @@ -826,6 +852,60 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) } } +void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) +{ + integer j,n,m,k,ms,me; + integer begin,end; + real tmp,val; + + #pragma acc data present(yhat,Astate) + { + GPTLstart("CostFunction3D::calcHTranspose2"); + // Multiply the state by the observation matrix + #pragma omp parallel for private(n,m,j,tmp,begin,end) //[9] + #pragma acc parallel loop vector gang vector_length(32) private(n,m,j,tmp,begin,end) //[9] + for(n=0; nms){ + for (k=ms;k0;i--) {IHt[i] = IHt[i-1];} + IHt[0]=0; + // // copy H matrix stuff to the GPU Device #pragma acc enter data copyin(mPtr,mVal,I2H) #pragma acc enter data copyin(H[:nonzeros]) + #pragma acc enter data copyin(Ht[:nonzeros]) cout << "Memory usage for [obsVector] (Mbytes): " << sizeof(real)*(mObs*(7+varDim*derivDim))/(1024.0*1024.0) << "\n"; cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << "\n"; cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; cout << "Memory usage for [IH,JH] (Mbytes): " << sizeof(integer)*(mObs+nonzeros+1)/(1024.*1024.) << "\n"; + cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; + cout << "Memory usage for [IHt,JHt] (Mbytes): " << sizeof(integer)*(nState+nonzeros+1)/(1024.*1024.) << "\n"; cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) << "\n"; delete[] Hlength; diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index 63ebc00..b94f239 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -71,6 +71,7 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi bool SAtranspose(const real* Astate, real* Bstate); void calcInnovation(); void calcHTranspose(const real* yhat, real* Astate); + void calcHTranspose2(const real* yhat, real* Astate); virtual bool outputAnalysis(const std::string& suffix, real* Astate) = 0; void SBtransform(const real* Ustate, real* Bstate); void SBtranspose(const real* Bstate, real* Ustate); @@ -151,9 +152,18 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi double *iFFTin, *jFFTin, *kFFTin; fftw_complex *iFFTout, *jFFTout, *kFFTout; bool UseFFT; + + // explicitly store the H matrix in CSR format real *H; - integer *IH, *I2H,*JH; - integer *mPtr, *mVal; + integer *IH, *JH; + + // Arrays to access the H matrix for H^t operator + integer *I2H; + integer *mPtr, *mVal; + + // explicity store the H^t matrix in CSR format + real *Ht; + integer *IHt,*JHt; int basisappx; real* basis0; From bb4b7c661f6a384ca6621994e27901e065b325f4 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Tue, 2 Jul 2024 15:42:56 -0600 Subject: [PATCH 02/14] Next step in Memory reduction. --- src/CostFunction3D.cpp | 46 +++++++++++++++++++++++++----------------- src/CostFunction3D.h | 11 ++++++---- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 7ac2e7f..48f33f2 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -826,7 +826,8 @@ void CostFunction3D::calcInnovation() void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) { - integer j,n,m,k,ms,me; + integer n,k; // uint32_t + integer j,m,ms,me; // uint64_t real tmp,val; #pragma acc data present(yhat,Astate,mPtr,mVal,I2H,H) { @@ -854,8 +855,8 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) { - integer j,n,m,k,ms,me; - integer begin,end; + integer n,m; // uint64_t + integer j,begin,end; // uint32_t real tmp,val; #pragma acc data present(yhat,Astate) @@ -2585,16 +2586,16 @@ void CostFunction3D::FFtransform(const real* Astate, real* Cstate) void CostFunction3D::calcHmatrix() { int n; - integer hi,m,mi; + integer hi,m,mi; // uint64_t real i,j,k; int ii,jj,kk,d,var; - integer cIndex,wgt_index; + integer cIndex,wgt_index; // uint64_t int iNode,jNode,kNode; int iiNode,jjNode,kkNode; int iis,iie,jjs,jje,kks,kke; int *Hlength; - integer *mTmp, *mIncr; - integer dst; + integer *mTmp, *mIncr; // uint64_t + integer dst; // uint64_t real ibasis,jbasis,kbasis; real weight; @@ -2607,10 +2608,10 @@ void CostFunction3D::calcHmatrix() //GPTLstart("CostFunction3D::calcHmatrix:allocate"); Hlength = new int[mObs]; - mPtr = new integer[nState+1]; + mPtr = new integer[nState+1]; // uint64_t - IH = new integer [mObs+1]; - IHt = new integer [nState+1]; + IH = new integer [mObs+1]; // uint64_t + IHt = new integer [nState+1]; // uint64_t //JMD variable-interleave //GPTLstart("CostFunction3D::calcHmatrix:nonzeros"); @@ -2655,7 +2656,7 @@ void CostFunction3D::calcHmatrix() Hlength[m]=hi; } - integer nonzeros = 0; + integer nonzeros = 0; // uint64_t for (int m = 0; m < mObs; m++) { nonzeros += Hlength[m]; } @@ -2668,12 +2669,12 @@ void CostFunction3D::calcHmatrix() H = new real[nonzeros]; Ht = new real[nonzeros]; - JH = new integer [nonzeros]; - JHt = new integer [nonzeros]; + JH = new integer [nonzeros]; // uint32_t + JHt = new integer [nonzeros]; // uint32_t - mVal = new integer [nonzeros]; - mTmp = new integer [nonzeros]; - mIncr = new integer [nState]; + mVal = new integer [nonzeros]; // uint64_t + mTmp = new integer [nonzeros]; // uint64_t + mIncr = new integer [nState]; // uint64_t hi=0; for (n=0;n Date: Wed, 3 Jul 2024 09:11:21 -0600 Subject: [PATCH 03/14] More explicit definitions of ints. --- ncar_scripts/TDRP/hurricane_4panel.tdrp | 2 +- src/CostFunction.h | 2 +- src/CostFunction3D.cpp | 112 +++++++++++++----------- src/CostFunction3D.h | 16 ++-- src/precision.h | 1 + 5 files changed, 74 insertions(+), 59 deletions(-) diff --git a/ncar_scripts/TDRP/hurricane_4panel.tdrp b/ncar_scripts/TDRP/hurricane_4panel.tdrp index 88efea1..30430a7 100644 --- a/ncar_scripts/TDRP/hurricane_4panel.tdrp +++ b/ncar_scripts/TDRP/hurricane_4panel.tdrp @@ -168,7 +168,7 @@ projection = PROJ_TRANSVERSE_MERCATOR_EXACT; // // Type: string -data_directory = "/glade/campaign/cisl/asap/samurai/data/special/hurricane"; +data_directory = "/glade/campaign/cisl/asap/samurai/data/special/hurricane_4panel"; ///////////// output_directory //////////////////////// // diff --git a/src/CostFunction.h b/src/CostFunction.h index 86d45a4..065fad2 100644 --- a/src/CostFunction.h +++ b/src/CostFunction.h @@ -28,7 +28,7 @@ class CostFunction protected: int ls_cnt; - int64_t mObs; + uint64_t mObs; int nState; real* currState; real* currGradient; diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 48f33f2..dde5b88 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -248,7 +248,7 @@ void CostFunction3D::initialize(HashMap* config, // These are local to this one CTHTd = new real[nState]; stateU = new real[nState]; - int64_t vector_size = mObs*(7+varDim*derivDim); + uint64_t vector_size = mObs*(7+varDim*derivDim); obsVector = new real[vector_size]; obsData = new real[mObs]; HCq = new real[mObs+nodes]; @@ -461,7 +461,7 @@ void CostFunction3D::initState(const int iteration) if ((*configHash)["load_bg_coefficients"] == "true") { - for (int64_t n = 0; n < nState; n++) { + for (uint64_t n = 0; n < nState; n++) { stateA[n] = bgFields[n]; } } else { @@ -494,7 +494,7 @@ void CostFunction3D::initState(const int iteration) for (int jIndex = 0; jIndex < jDim; jIndex++) { for (int kIndex = 0; kIndex < kDim; kIndex++) { // int bIndex = varDim * iDim * jDim*kIndex + varDim * iDim * jIndex + varDim * iIndex + var; - int64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); + uint64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); // *bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); bgStdDev[bIndex] = variance.meshValueAt(var, iIndex, jIndex, kIndex); } @@ -510,7 +510,7 @@ void CostFunction3D::initState(const int iteration) for (int jIndex = 0; jIndex < jDim; jIndex++) { for (int kIndex = 0; kIndex < kDim; kIndex++) { // int bIndex = varDim * iDim * jDim * kIndex + varDim * iDim * jIndex + varDim * iIndex + var; - int64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); + uint64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); // int64_t *bIndex = (int64_t *)malloc(sizeof(int64_t)); // *bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); varScale += bgState[bIndex] * bgState[bIndex]; @@ -826,8 +826,8 @@ void CostFunction3D::calcInnovation() void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) { - integer n,k; // uint32_t - integer j,m,ms,me; // uint64_t + uint32_t n,k; // uint32_t + uint64_t j,m,ms,me; // uint64_t real tmp,val; #pragma acc data present(yhat,Astate,mPtr,mVal,I2H,H) { @@ -855,8 +855,8 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) { - integer n,m; // uint64_t - integer j,begin,end; // uint32_t + uint64_t n,m; // uint64_t + uint32_t j,begin,end; // uint32_t real tmp,val; #pragma acc data present(yhat,Astate) @@ -1323,7 +1323,7 @@ void CostFunction3D::SBtransform(const real* Ustate, real* Bstate) void CostFunction3D::SBtransform(const real* Ustate, real* Bstate) { // Clear the Bstate - for (int64_t n = 0; n < nState; n++) { + for (uint64_t n = 0; n < nState; n++) { Bstate[n] = 0.; } real gausspoint = 0.5*sqrt(1./3.); @@ -1360,13 +1360,13 @@ void CostFunction3D::SBtransform(const real* Ustate, real* Bstate) real kbasis = Basis(kNode, k, kDim-1, kMin, DK, DKrecip, 0, kBCL[var], kBCR[var]); real ijkbasis = 0.125 * ijbasis * kbasis; int uK = kIndex*2 + (kmu+1)/2; - int64_t uIndex = varDim * (iDim - 1) * 2 * (jDim - 1) * 2 * uK + uint64_t uIndex = varDim * (iDim - 1) * 2 * (jDim - 1) * 2 * uK + varDim * (iDim -1 ) * 2 * uJ + varDim * uI; - int64_t bIndex = varDim*iDim*jDim*kNode + varDim*iDim*jNode +varDim*iNode; + uint64_t bIndex = varDim*iDim*jDim*kNode + varDim*iDim*jNode +varDim*iNode; - int64_t ui = uIndex + var; + uint64_t ui = uIndex + var; if (Ustate[ui] == 0) continue; - int64_t bi = bIndex + var; + uint64_t bi = bIndex + var; //#pragma omp atomic Bstate[bi] += Ustate[ui] * ijkbasis; } @@ -2586,16 +2586,16 @@ void CostFunction3D::FFtransform(const real* Astate, real* Cstate) void CostFunction3D::calcHmatrix() { int n; - integer hi,m,mi; // uint64_t + uint64_t hi,m,mi; // uint64_t real i,j,k; int ii,jj,kk,d,var; - integer cIndex,wgt_index; // uint64_t + uint64_t cIndex,wgt_index; // uint64_t int iNode,jNode,kNode; int iiNode,jjNode,kkNode; int iis,iie,jjs,jje,kks,kke; int *Hlength; - integer *mTmp, *mIncr; // uint64_t - integer dst; // uint64_t + uint64_t *mTmp, *mIncr; // uint64_t + uint64_t dst; // uint64_t real ibasis,jbasis,kbasis; real weight; @@ -2608,10 +2608,10 @@ void CostFunction3D::calcHmatrix() //GPTLstart("CostFunction3D::calcHmatrix:allocate"); Hlength = new int[mObs]; - mPtr = new integer[nState+1]; // uint64_t + mPtr = new uint64_t [nState+1]; // uint64_t - IH = new integer [mObs+1]; // uint64_t - IHt = new integer [nState+1]; // uint64_t + IH = new uint64_t [mObs+1]; // uint64_t + IHt = new uint64_t [nState+1]; // uint64_t //JMD variable-interleave //GPTLstart("CostFunction3D::calcHmatrix:nonzeros"); @@ -2655,26 +2655,37 @@ void CostFunction3D::calcHmatrix() } Hlength[m]=hi; } - - integer nonzeros = 0; // uint64_t - for (int m = 0; m < mObs; m++) { + std::cout << "CostFunction3D:: After big loop " << std::endl; + uint64_t nonzeros = 0; // uint64_t + for (m = 0; m < mObs; m++) { nonzeros += Hlength[m]; } //GPTLstop("CostFunction3D::calcHmatrix:nonzeros"); + std::cout << "CostFunction3D:: nonzeros " << nonzeros << std::endl; + std::cout << "CostFunction3D:: mObs " << mObs << std::endl; IH[mObs] = nonzeros; - std::cout << "sizeof(integer): " << sizeof(integer) << "\n"; - std::cout << "Non-zero entries in sparse H matrix: " << nonzeros << " = " << 100.0*float(nonzeros)/(float(mObs)*float(nState)) << " %\n"; + std::cout << "CostFunction3D:: after IH[mObs] " << IH[mObs] << std::endl; + // std::cout << "sizeof(integer): " << sizeof(integer) << "\n"; + // std::cout << "Non-zero entries in sparse H matrix: " << nonzeros << " = " << 100.0*float(nonzeros)/(float(mObs)*float(nState)) << " %\n"; std::cout << "Memory usage for [H] (Mbytes): " << sizeof(real)*(nonzeros)/(1024.0*1024.0) << "\n"; H = new real[nonzeros]; + std::cout << "After allocation of H" << std::endl; Ht = new real[nonzeros]; + std::cout << "After allocation of Ht" << std::endl; + - JH = new integer [nonzeros]; // uint32_t - JHt = new integer [nonzeros]; // uint32_t + JH = new uint32_t [nonzeros]; // uint32_t + std::cout << "After allocation of JH" << std::endl; + JHt = new uint32_t [nonzeros]; // uint32_t + std::cout << "After allocation of JHt" << std::endl; - mVal = new integer [nonzeros]; // uint64_t - mTmp = new integer [nonzeros]; // uint64_t - mIncr = new integer [nState]; // uint64_t + mVal = new uint64_t [nonzeros]; // uint64_t + std::cout << "After allocation of mVal" << std::endl; + mTmp = new uint64_t [nonzeros]; // uint64_t + std::cout << "After allocation of mTmp" << std::endl; + mIncr = new uint64_t [nState]; // uint64_t + std::cout << "After allocation of mIncr" << std::endl; hi=0; for (n=0;n0;i--) {IHt[i] = IHt[i-1];} + for(uint32_t i=nState;i>0;i--) {IHt[i] = IHt[i-1];} IHt[0]=0; + std::cout << "After construction of H^t" << std::endl; // // copy H matrix stuff to the GPU Device @@ -2770,14 +2784,14 @@ void CostFunction3D::calcHmatrix() cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << "\n"; cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << "\n"; - // cout << "Memory usage for [IH,JH] (Mbytes): " << (sizeof(uint64_t)*(mObs+1)+ sizeof(uint32_t)*nonzeros)/(1024.*1024.) << "\n"; - cout << "Memory usage for [IH,JH] (Mbytes): " << sizeof(integer)*(mObs+nonzeros+1)/(1024.*1024.) << "\n"; + cout << "Memory usage for [IH,JH] (Mbytes): " << (sizeof(uint64_t)*(mObs+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << "\n"; + // cout << "Memory usage for [IH,JH] (Mbytes): " << sizeof(integer)*(mObs+nonzeros+1)/(1024.*1024.) << "\n"; - //cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(uint64_t)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; - cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; + cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(uint64_t)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; + // cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; - // cout << "Memory usage for [IHt,JHt] (Mbytes): " << (sizeof(uint64_t)*(nState+1)+ sizeof(uint32_t)*nonzeros)/(1024.*1024.) << "\n"; - cout << "Memory usage for [IHt,JHt] (Mbytes): " << sizeof(integer)*(nState+nonzeros+1)/(1024.*1024.) << "\n"; + cout << "Memory usage for [IHt,JHt] (Mbytes): " << (sizeof(uint64_t)*(nState+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << "\n"; + // cout << "Memory usage for [IHt,JHt] (Mbytes): " << sizeof(integer)*(nState+nonzeros+1)/(1024.*1024.) << "\n"; cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) << "\n"; @@ -2791,8 +2805,8 @@ void CostFunction3D::calcHmatrix() void CostFunction3D::Htransform(const real* Cstate, real* Hstate) { - integer i; // uint32_t - integer j,begin,end; // uint64_t + uint32_t i; // uint32_t + uint64_t j,begin,end; // uint64_t real tmp; #pragma acc data present(Cstate,Hstate) diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index 1635491..b7b33a2 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -139,7 +139,7 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi real* kGammaL; real* kLL; real* finalAnalysis; - int64_t varDim; // NCAR: promoted to 64-bit, since it should auto-promote calculations with it to 64-bit + uint64_t varDim; // NCAR: promoted to 64-bit, since it should auto-promote calculations with it to 64-bit int derivDim; real bgError[7]; int iBCL[7], iBCR[7], jBCL[7], jBCR[7], kBCL[7], kBCR[7]; @@ -155,18 +155,18 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi // explicitly store the H matrix in CSR format real *H; - integer *IH; // uint64_t - integer *JH; // uint32_t + uint64_t *IH; // uint64_t + uint32_t *JH; // uint32_t // Arrays to access the H matrix for H^t operator - integer *I2H; // uint64_t - integer *mPtr; // uint64_t - integer *mVal; // uint64_t + uint64_t *I2H; // uint64_t + uint64_t *mPtr; // uint64_t + uint64_t *mVal; // uint64_t // explicity store the H^t matrix in CSR format real *Ht; - integer *IHt; // uint64_t - integer *JHt; // uint32_t + uint64_t *IHt; // uint64_t + uint32_t *JHt; // uint32_t int basisappx; real* basis0; diff --git a/src/precision.h b/src/precision.h index aef4993..bfa14cb 100644 --- a/src/precision.h +++ b/src/precision.h @@ -14,5 +14,6 @@ typedef double real; typedef unsigned long int integer; +//typedef unsigned int integer; #endif From ac504c26ee073e7403caf4608a5fa7722620dcb8 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Wed, 3 Jul 2024 10:08:45 -0600 Subject: [PATCH 04/14] Code cleanup for new CSR format for H^t matrix --- src/CostFunction3D.cpp | 110 ++++++++--------------------------------- src/CostFunction3D.h | 6 +-- 2 files changed, 24 insertions(+), 92 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index dde5b88..73c4712 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -544,23 +544,7 @@ void CostFunction3D::initState(const int iteration) cout << "Beginning analysis...\n"; // HTd - calcHTranspose2(innovation, stateC); - // calcHTranspose2(innovation, stateC2); - /* - exit(1); - - cout << "CostFunction3D::initState: " << endl; - - for (int i=120;i<140;i++) { - // if(stateC[i] != stateC2[i]) { - cout << i << " " << stateC[i] << " " << stateC2[i] << endl;} - // } - for (int i=0;i< 100; i++) { - cout << "IHt[" << i << "]:= " << IHt[i] << endl;} - for (int i=0;i< 100; i++) { - cout << "JHt[" << i << "]:= " << JHt[i] << endl;} - */ - + calcHTranspose(innovation, stateC); #pragma acc data create(stateB[0:nState]) { @@ -631,7 +615,7 @@ void CostFunction3D::funcGradient(real* state, real* gradient) GPTLstart("CostFunction3D::funcGradient:calcHTranspose"); // HTHCq - calcHTranspose2(HCq, stateC); + calcHTranspose(HCq, stateC); GPTLstop("CostFunction3D::funcGradient:calcHTranspose"); GPTLstart("CostFunction3D::funcGradient:FFtransform"); @@ -696,7 +680,7 @@ real CostFunction3D::funcValueAndGradient(real* state, real *gradient) J = 0.5*(qIP + obIP); //Now Gradient (also uses HCq) - calcHTranspose2(HCq, stateC); + calcHTranspose(HCq, stateC); FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -724,7 +708,7 @@ void CostFunction3D::funcHessian(real* x, real *hessian) //calc HCx (store in global variable HCq) updateHCq(x,HCq); - calcHTranspose2(HCq, stateC); + calcHTranspose(HCq, stateC); FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -824,7 +808,7 @@ void CostFunction3D::calcInnovation() GPTLstop("CostFunction3D::calcInnovation"); } -void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) +void CostFunction3D::calcHTransposeOLD(const real* yhat, real* Astate) { uint32_t n,k; // uint32_t uint64_t j,m,ms,me; // uint64_t @@ -853,7 +837,7 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) } } -void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) +void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) { uint64_t n,m; // uint64_t uint32_t j,begin,end; // uint32_t @@ -861,7 +845,7 @@ void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) #pragma acc data present(yhat,Astate) { - GPTLstart("CostFunction3D::calcHTranspose2"); + GPTLstart("CostFunction3D::calcHTranspose"); // Multiply the state by the observation matrix #pragma omp parallel for private(n,m,j,tmp,begin,end) //[9] #pragma acc parallel loop vector gang vector_length(32) private(n,m,j,tmp,begin,end) //[9] @@ -871,44 +855,15 @@ void CostFunction3D::calcHTranspose2(const real* yhat, real* Astate) for(j=begin; jms){ - for (k=ms;k0;i--) {IHt[i] = IHt[i-1];} IHt[0]=0; - std::cout << "After construction of H^t" << std::endl; + std::cout << "CostFunction3D::calcHmatrix: After construction of H^t" << std::endl; // // copy H matrix stuff to the GPU Device #pragma acc enter data copyin(mPtr,mVal,I2H) #pragma acc enter data copyin(H[:nonzeros]) #pragma acc enter data copyin(Ht[:nonzeros]) - cout << "Memory usage for [obsVector] (Mbytes): " << sizeof(real)*(mObs*(7+varDim*derivDim))/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << "\n"; - cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << "\n"; - - cout << "Memory usage for [IH,JH] (Mbytes): " << (sizeof(uint64_t)*(mObs+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << "\n"; - // cout << "Memory usage for [IH,JH] (Mbytes): " << sizeof(integer)*(mObs+nonzeros+1)/(1024.*1024.) << "\n"; - - cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(uint64_t)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; - // cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(integer)*(nState+2.*nonzeros+1)/(1024.*1024.) << "\n"; - - cout << "Memory usage for [IHt,JHt] (Mbytes): " << (sizeof(uint64_t)*(nState+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << "\n"; - // cout << "Memory usage for [IHt,JHt] (Mbytes): " << sizeof(integer)*(nState+nonzeros+1)/(1024.*1024.) << "\n"; - - cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) << "\n"; + std::cout << "Memory usage for [obsVector] (Mbytes): " << sizeof(real)*(mObs*(7+varDim*derivDim))/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [obsData] (Mbytes): " << sizeof(real)*(mObs)/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [HCq] (Mbytes): " << sizeof(real)*(mObs+(iDim*jDim*kDim))/(1024.0*1024.0) << std::endl; + std::cout << "Memory usage for [IH,JH] (Mbytes): " << (sizeof(uint64_t)*(mObs+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << std::endl; + std::cout << "Memory usage for [mPtr,mVal,I2H] (Mbytes): " << sizeof(uint64_t)*(nState+2.*nonzeros+1)/(1024.*1024.) << std::endl; + std::cout << "Memory usage for [IHt,JHt] (Mbytes): " << (sizeof(uint64_t)*(nState+1)+ sizeof(int32_t)*nonzeros)/(1024.*1024.) << std::endl; + std::cout << "Memory usage for [state] (Mbytes): " << sizeof(real)*(nState)/(1024.*1024.) << std::endl; delete[] Hlength; delete[] mIncr; diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index b7b33a2..5f084e7 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -70,8 +70,8 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi bool SAtransform(const real* Bstate, real* Astate); bool SAtranspose(const real* Astate, real* Bstate); void calcInnovation(); + void calcHTransposeOLD(const real* yhat, real* Astate); void calcHTranspose(const real* yhat, real* Astate); - void calcHTranspose2(const real* yhat, real* Astate); virtual bool outputAnalysis(const std::string& suffix, real* Astate) = 0; void SBtransform(const real* Ustate, real* Bstate); void SBtranspose(const real* Bstate, real* Ustate); @@ -121,7 +121,7 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi real* bgState; real* bgStdDev; real* obsVector; - real* obsData; // This only contains data that is needed by the calcHTranspose2 subroutine + real* obsData; // This only contains the necessary data for the calcHTranspose subroutine real* rawObs; real* stateA; real* stateB; @@ -136,7 +136,7 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi real* iGamma[7]; real* jGamma[7]; real* kGamma[7]; - real* kGammaL; + real* kGammaL; real* kLL; real* finalAnalysis; uint64_t varDim; // NCAR: promoted to 64-bit, since it should auto-promote calculations with it to 64-bit From c9a61afcaa006c4a9575e3053eb98a86655228dc Mon Sep 17 00:00:00 2001 From: johnmauff Date: Mon, 8 Jul 2024 17:05:56 -0600 Subject: [PATCH 05/14] Verified that this works for the hurricane_4panel case. --- src/CostFunction3D.cpp | 124 ++++++++++++++++++++++++++--------------- src/CostFunction3D.h | 6 +- 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 73c4712..6c22310 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -5,6 +5,7 @@ * Copyright 2008 Michael Bell. All rights reserved. * */ +#define NEWHT 1 #include #include @@ -248,7 +249,7 @@ void CostFunction3D::initialize(HashMap* config, // These are local to this one CTHTd = new real[nState]; stateU = new real[nState]; - uint64_t vector_size = mObs*(7+varDim*derivDim); + int64_t vector_size = mObs*(7+varDim*derivDim); obsVector = new real[vector_size]; obsData = new real[mObs]; HCq = new real[mObs+nodes]; @@ -461,7 +462,7 @@ void CostFunction3D::initState(const int iteration) if ((*configHash)["load_bg_coefficients"] == "true") { - for (uint64_t n = 0; n < nState; n++) { + for (int64_t n = 0; n < nState; n++) { stateA[n] = bgFields[n]; } } else { @@ -494,7 +495,7 @@ void CostFunction3D::initState(const int iteration) for (int jIndex = 0; jIndex < jDim; jIndex++) { for (int kIndex = 0; kIndex < kDim; kIndex++) { // int bIndex = varDim * iDim * jDim*kIndex + varDim * iDim * jIndex + varDim * iIndex + var; - uint64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); + int64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); // *bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); bgStdDev[bIndex] = variance.meshValueAt(var, iIndex, jIndex, kIndex); } @@ -510,7 +511,7 @@ void CostFunction3D::initState(const int iteration) for (int jIndex = 0; jIndex < jDim; jIndex++) { for (int kIndex = 0; kIndex < kDim; kIndex++) { // int bIndex = varDim * iDim * jDim * kIndex + varDim * iDim * jIndex + varDim * iIndex + var; - uint64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); + int64_t bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); // int64_t *bIndex = (int64_t *)malloc(sizeof(int64_t)); // *bIndex = INDEX(iIndex, jIndex, kIndex, iDim, jDim, varDim, var); varScale += bgState[bIndex] * bgState[bIndex]; @@ -544,7 +545,11 @@ void CostFunction3D::initState(const int iteration) cout << "Beginning analysis...\n"; // HTd +#ifdef NEWHT calcHTranspose(innovation, stateC); +#endif + calcHTransposeOLD(innovation, stateC); +#endif #pragma acc data create(stateB[0:nState]) { @@ -615,7 +620,11 @@ void CostFunction3D::funcGradient(real* state, real* gradient) GPTLstart("CostFunction3D::funcGradient:calcHTranspose"); // HTHCq +#ifdef NEWHT calcHTranspose(HCq, stateC); +#else + calcHTransposeOLD(HCq, stateC); +#endif GPTLstop("CostFunction3D::funcGradient:calcHTranspose"); GPTLstart("CostFunction3D::funcGradient:FFtransform"); @@ -680,7 +689,11 @@ real CostFunction3D::funcValueAndGradient(real* state, real *gradient) J = 0.5*(qIP + obIP); //Now Gradient (also uses HCq) +#ifdef NEWHT calcHTranspose(HCq, stateC); +#else + calcHTransposeOLD(HCq, stateC); +#endif FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -708,7 +721,11 @@ void CostFunction3D::funcHessian(real* x, real *hessian) //calc HCx (store in global variable HCq) updateHCq(x,HCq); +#ifdef NEWHT calcHTranspose(HCq, stateC); +#else + calcHTransposeOLD(HCq, stateC); +#endif FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -810,19 +827,20 @@ void CostFunction3D::calcInnovation() void CostFunction3D::calcHTransposeOLD(const real* yhat, real* Astate) { - uint32_t n,k; // uint32_t + uint64_t n,k; // uint32_t uint64_t j,m,ms,me; // uint64_t real tmp,val; #pragma acc data present(yhat,Astate,mPtr,mVal,I2H,H) { GPTLstart("CostFunction3D::calcHTranspose"); #pragma omp parallel for private(n,k,ms,me,tmp,m,j,val) - #pragma acc parallel loop gang vector vector_length(32) private(n,k,ms,me,tmp,m,j,val) + #pragma acc parallel loop gang vector vector_length(32) private(n,k,ms,me,tmp,m,j,val) reduction(+:tmp) for(n=0;nms){ + #pragma acc loop reduction(+:tmp) for (k=ms;k0;i--) {IHt[i] = IHt[i-1];} + std::cout << "CostFunction3D::calcHmatrix: after H^t assignment" << std::endl; + for(uint64_t i=nState;i>0;i--) {IHt[i] = IHt[i-1];} IHt[0]=0; std::cout << "CostFunction3D::calcHmatrix: After construction of H^t" << std::endl; +#else + + I2H = new uint64_t [nonzeros]; // uint64_t + mPtr[0]=0; + for (n=1;n<=nState;n++){ + mPtr[n] = mPtr[n-1]+mIncr[n-1]; + } + for (n=0;n Date: Tue, 9 Jul 2024 10:28:05 -0600 Subject: [PATCH 06/14] More bug fixes for new-H^t operator. --- src/CostFunction3D.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 6c22310..430df5d 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -547,7 +547,7 @@ void CostFunction3D::initState(const int iteration) // HTd #ifdef NEWHT calcHTranspose(innovation, stateC); -#endif +#else calcHTransposeOLD(innovation, stateC); #endif @@ -2653,7 +2653,9 @@ void CostFunction3D::calcHmatrix() std::cout << "CostFunction3D::calcHmatrix: before big loop" << std::endl; hi=0; +#ifndef NEWHT for (n=0;n Date: Tue, 9 Jul 2024 11:26:36 -0600 Subject: [PATCH 07/14] Reduced int length for JH array --- src/CostFunction3D.cpp | 11 +++++------ src/CostFunction3D.h | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 430df5d..0f7f08b 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -95,10 +95,12 @@ void CostFunction3D::finalize() delete[] stateB; delete[] stateC; // deallocate Clean-up the data that correspond to the H matrix +#ifndef NEWHT #pragma acc exit data delete(mPtr,mVal,I2H) delete[] mPtr; delete[] mVal; delete[] I2H; +#endif #pragma acc exit data delete(H,JH,IH) delete[] H; @@ -2641,7 +2643,7 @@ void CostFunction3D::calcHmatrix() std::cout << "CostFunction3D:: nonzeros " << nonzeros << std::endl; std::cout << "Memory usage for [H] (Mbytes): " << sizeof(real)*(nonzeros)/(1024.0*1024.0) << std::endl; H = new real[nonzeros]; - JH = new uint64_t [nonzeros]; // uint32_t + JH = new uint32_t [nonzeros]; // uint32_t #ifdef NEWHT Ht = new real[nonzeros]; JHt = new uint64_t [nonzeros]; // uint32_t @@ -2686,7 +2688,7 @@ void CostFunction3D::calcHmatrix() cIndex = INDEX(iNode, jNode, kNode, iDim, jDim, varDim, var); weight = ibasis * jbasis * kbasis * obsVector[wgt_index]; H[hi] = weight; - JH[hi] = cIndex; + JH[hi] = (uint32_t) cIndex; #ifndef NEWHT) mIncr[cIndex]+=1; mTmp[hi]=m; @@ -2704,10 +2706,7 @@ void CostFunction3D::calcHmatrix() std::cout << "CostFunction3D::calcHmatrix: Before construction of H^t" << std::endl; /* Calculate the indicies for the H^t matrix */ for (uint64_t n=0;n Date: Tue, 9 Jul 2024 14:31:07 -0600 Subject: [PATCH 08/14] Change integer size --- src/CostFunction3D.cpp | 8 ++++---- src/CostFunction3D.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 0f7f08b..68ec7e0 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -874,7 +874,7 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) begin = IHt[n]; end = IHt[n + 1]; #pragma acc loop reduction(+:tmp) for(j=begin; j Date: Wed, 10 Jul 2024 11:04:12 -0600 Subject: [PATCH 09/14] Cleanup calculation of nnz --- src/CostFunction3D.cpp | 68 ++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 68ec7e0..6f4dc6e 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -2563,13 +2563,13 @@ void CostFunction3D::calcHmatrix() { int n; uint64_t hi,m,mi; // uint64_t + uint64_t nnz; real i,j,k; int ii,jj,kk,d,var; uint64_t cIndex,wgt_index; // uint64_t int iNode,jNode,kNode; int iiNode,jjNode,kkNode; int iis,iie,jjs,jje,kks,kke; - int *Hlength; uint64_t *mTmp, *mIncr; // uint64_t uint64_t dst; // uint64_t @@ -2583,27 +2583,27 @@ void CostFunction3D::calcHmatrix() //GPTLstart("CostFunction3D::calcHmatrix:allocate"); - Hlength = new int[mObs]; - mPtr = new uint64_t [nState+1]; // uint64_t - IH = new uint64_t [mObs+1]; // uint64_t #ifdef NEWHT IHt = new uint64_t [nState+1]; // uint64_t +#else + mPtr = new uint64_t [nState+1]; // uint64_t #endif + nnz=0; //JMD variable-interleave - //GPTLstart("CostFunction3D::calcHmatrix:nonzeros"); + //GPTLstart("CostFunction3D::calcHmatrix:nnz"); // Determine the number of non-zeros in H //#pragma omp parallel for private(m,mi,hi,i,j,k,ii,iis,iie,jj,jjs,jje,kk,kks,kke,ibasis,jbasis,kbasis,iiNode,jjNode,kkNode,iNode,jNode,kNode,var,d,wgt_index,weight,cIndex) //[8.1] - #pragma acc parallel loop vector gang vector_length(32) copyout(Hlength[0:mObs]) private(m,mi,hi,i,j,k,ii,iis,iie,jj,jjs,jje,kk,kks,kke,ibasis,jbasis,kbasis,iiNode,jjNode,kkNode,iNode,jNode,kNode,var,d,wgt_index,weight,cIndex) + #pragma acc parallel loop vector gang vector_length(32) private(m,mi,hi,iis,iie,jjs,jje,kks,kke,ibasis,jbasis,kbasis,iiNode,jjNode,kkNode,iNode,jNode,kNode,var,d,wgt_index,weight,cIndex) reduction(+:nnz) for (m = 0; m < mObs; m++) { mi = m*(7+varDim*derivDim); - i = obsVector[mi+2]; - j = obsVector[mi+3]; - k = obsVector[mi+4]; - ii = (int)((i - iMin)*DIrecip);iis=max(0,ii-1);iie=min(ii+2,iDim-1); - jj = (int)((j - jMin)*DJrecip);jjs=max(0,jj-1);jje=min(jj+2,jDim-1); - kk = (int)((k - kMin)*DKrecip);kks=max(0,kk-1);kke=min(kk+2,kDim-1); + real i = obsVector[mi+2]; + real j = obsVector[mi+3]; + real k = obsVector[mi+4]; + int ii = (int)((i - iMin)*DIrecip);iis=max(0,ii-1);iie=min(ii+2,iDim-1); + int jj = (int)((j - jMin)*DJrecip);jjs=max(0,jj-1);jje=min(jj+2,jDim-1); + int kk = (int)((k - kMin)*DKrecip);kks=max(0,kk-1);kke=min(kk+2,kDim-1); ibasis = 0; jbasis = 0; kbasis = 0; @@ -2626,30 +2626,27 @@ void CostFunction3D::calcHmatrix() if (!kbasis) continue; // Count the number of non-zero entries in the observation matrix... hi++; + nnz++; } } } } } - Hlength[m]=hi; } - uint64_t nonzeros = 0; // uint64_t - for (m = 0; m < mObs; m++) { - nonzeros += Hlength[m]; - } - //GPTLstop("CostFunction3D::calcHmatrix:nonzeros"); - IH[mObs] = nonzeros; - std::cout << "CostFunction3D:: nonzeros " << nonzeros << std::endl; - std::cout << "Memory usage for [H] (Mbytes): " << sizeof(real)*(nonzeros)/(1024.0*1024.0) << std::endl; - H = new real[nonzeros]; - JH = new uint32_t [nonzeros]; // uint32_t + //GPTLstop("CostFunction3D::calcHmatrix:nnz"); + IH[mObs] = nnz; + + std::cout << "CostFunction3D:: nnz " << nnz << std::endl; + std::cout << "Memory usage for [H] (Mbytes): " << sizeof(real)*(nnz)/(1024.0*1024.0) << std::endl; + H = new real[nnz]; + JH = new uint32_t [nnz]; // uint32_t #ifdef NEWHT - Ht = new real[nonzeros]; - JHt = new uint32_t [nonzeros]; // uint32_t + Ht = new real[nnz]; + JHt = new uint32_t [nnz]; // uint32_t #else - mVal = new uint64_t [nonzeros]; // uint64_t - mTmp = new uint64_t [nonzeros]; // uint64_t + mVal = new uint64_t [nnz]; // uint64_t + mTmp = new uint64_t [nnz]; // uint64_t mIncr = new uint64_t [nState]; // uint64_t #endif @@ -2706,7 +2703,7 @@ void CostFunction3D::calcHmatrix() std::cout << "CostFunction3D::calcHmatrix: Before construction of H^t" << std::endl; /* Calculate the indicies for the H^t matrix */ for (uint64_t n=0;n Date: Wed, 10 Jul 2024 11:47:39 -0600 Subject: [PATCH 10/14] Removed remaining old H^t code. --- src/CostFunction3D.cpp | 96 ------------------------------------------ src/CostFunction3D.h | 5 --- 2 files changed, 101 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 6f4dc6e..02a4933 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -5,7 +5,6 @@ * Copyright 2008 Michael Bell. All rights reserved. * */ -#define NEWHT 1 #include #include @@ -95,12 +94,6 @@ void CostFunction3D::finalize() delete[] stateB; delete[] stateC; // deallocate Clean-up the data that correspond to the H matrix -#ifndef NEWHT - #pragma acc exit data delete(mPtr,mVal,I2H) - delete[] mPtr; - delete[] mVal; - delete[] I2H; -#endif #pragma acc exit data delete(H,JH,IH) delete[] H; @@ -547,11 +540,7 @@ void CostFunction3D::initState(const int iteration) cout << "Beginning analysis...\n"; // HTd -#ifdef NEWHT calcHTranspose(innovation, stateC); -#else - calcHTransposeOLD(innovation, stateC); -#endif #pragma acc data create(stateB[0:nState]) { @@ -622,11 +611,7 @@ void CostFunction3D::funcGradient(real* state, real* gradient) GPTLstart("CostFunction3D::funcGradient:calcHTranspose"); // HTHCq -#ifdef NEWHT calcHTranspose(HCq, stateC); -#else - calcHTransposeOLD(HCq, stateC); -#endif GPTLstop("CostFunction3D::funcGradient:calcHTranspose"); GPTLstart("CostFunction3D::funcGradient:FFtransform"); @@ -691,11 +676,7 @@ real CostFunction3D::funcValueAndGradient(real* state, real *gradient) J = 0.5*(qIP + obIP); //Now Gradient (also uses HCq) -#ifdef NEWHT calcHTranspose(HCq, stateC); -#else - calcHTransposeOLD(HCq, stateC); -#endif FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -723,11 +704,7 @@ void CostFunction3D::funcHessian(real* x, real *hessian) //calc HCx (store in global variable HCq) updateHCq(x,HCq); -#ifdef NEWHT calcHTranspose(HCq, stateC); -#else - calcHTransposeOLD(HCq, stateC); -#endif FFtransform(stateC, stateA); SAtransform(stateA, stateB); SCtransform(stateB, stateC); @@ -827,36 +804,6 @@ void CostFunction3D::calcInnovation() GPTLstop("CostFunction3D::calcInnovation"); } -void CostFunction3D::calcHTransposeOLD(const real* yhat, real* Astate) -{ - uint64_t n,k; // uint32_t - uint64_t j,m,ms,me; // uint64_t - real tmp,val; - #pragma acc data present(yhat,Astate,mPtr,mVal,I2H,H) - { - GPTLstart("CostFunction3D::calcHTranspose"); - #pragma omp parallel for private(n,k,ms,me,tmp,m,j,val) - #pragma acc parallel loop gang vector vector_length(32) private(n,k,ms,me,tmp,m,j,val) reduction(+:tmp) - for(n=0;nms){ - #pragma acc loop reduction(+:tmp) - for (k=ms;k0;i--) {IHt[i] = IHt[i-1];} IHt[0]=0; std::cout << "CostFunction3D::calcHmatrix: After construction of H^t" << std::endl; -#else - - I2H = new uint64_t [nnz]; // uint64_t - mPtr[0]=0; - for (n=1;n<=nState;n++){ - mPtr[n] = mPtr[n-1]+mIncr[n-1]; - } - for (n=0;n Date: Wed, 10 Jul 2024 15:40:28 -0600 Subject: [PATCH 11/14] Minor cleanup of the code. --- ncar_scripts/casper_h100_submit.sh | 38 +++++++++++++++++++++++++++++ ncar_scripts/derecho_a100_submit.sh | 10 ++++++-- src/CostFunction3D.cpp | 2 -- src/CostFunction3D.h | 1 - src/precision.h | 2 -- 5 files changed, 46 insertions(+), 7 deletions(-) create mode 100644 ncar_scripts/casper_h100_submit.sh diff --git a/ncar_scripts/casper_h100_submit.sh b/ncar_scripts/casper_h100_submit.sh new file mode 100644 index 0000000..cb2cbd0 --- /dev/null +++ b/ncar_scripts/casper_h100_submit.sh @@ -0,0 +1,38 @@ +#!/bin/bash -l +#PBS -N SAMURAI +#PBS -A NEOL0013 +#PBS -l select=1:ncpus=36:ompthreads=1:mem=700GB:ngpus=1 +#PBS -l gpu_type=h100 +#PBS -q casper +#PBS -l walltime=03:30:00 +#PBS -j oe +#PBS -k eod + +cd $PBS_O_WORKDIR +cd .. +export SAMURAI_ROOT=$(pwd) + +ID=`date '+%Y%m%d%H%M'` +################## +# Build the code # +################## +sed -i 's/cc70/cc90/g' CMakeLists.txt +sed -i 's/cc80/cc90/g' CMakeLists.txt + +cd ncar_scripts +./ncar_build.sh gpu + +############## +# Run a case # +############## +suffix="casper_gpu" +#for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel +for i in hurricane_4panel +do + ./ncar_run.sh $SAMURAI_ROOT/ncar_scripts/TDRP/${i}.tdrp >& log_${i}_$suffix.$ID + if [ ! -d ${i}_${suffix} ]; then + mkdir ${i}_${suffix} + fi + mv $SAMURAI_ROOT/run/timing.0 ${i}_${suffix}/timing.$ID + mv $SAMURAI_ROOT/run/samurai* log* ${i}_${suffix} +done diff --git a/ncar_scripts/derecho_a100_submit.sh b/ncar_scripts/derecho_a100_submit.sh index d391140..4c94661 100644 --- a/ncar_scripts/derecho_a100_submit.sh +++ b/ncar_scripts/derecho_a100_submit.sh @@ -1,7 +1,7 @@ #!/bin/bash -l #PBS -N SAMURAI #PBS -A NEOL0013 -#PBS -l select=1:ncpus=64:ompthreads=1:mem=100GB:ngpus=1 +#PBS -l select=1:ncpus=64:ompthreads=1:mem=300GB:ngpus=1 #PBS -q main #PBS -l walltime=02:30:00 #PBS -j oe @@ -12,6 +12,10 @@ cd .. export SAMURAI_ROOT=$(pwd) ID=`date '+%Y%m%d%H%M'` + +sed -i 's/cc70/cc80/g' CMakeLists.txt +sed -i 's/cc90/cc80/g' CMakeLists.txt + ################## # Build the code # ################## @@ -22,7 +26,9 @@ cd ncar_scripts # Run a case # ############## suffix="derecho_gpu" -for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel +#for i in supercell # hurricane_4panel +#for i in beltrami hurricane_4panel +for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel do ./ncar_run.sh $SAMURAI_ROOT/ncar_scripts/TDRP/${i}.tdrp >& log_${i}_$suffix.$ID diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 02a4933..d08c2b6 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -2517,8 +2517,6 @@ void CostFunction3D::calcHmatrix() int iNode,jNode,kNode; int iiNode,jjNode,kkNode; int iis,iie,jjs,jje,kks,kke; - uint64_t *mTmp, *mIncr; // uint64_t - uint64_t dst; // uint64_t real ibasis,jbasis,kbasis; real weight; diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index cb2d73a..74ecf45 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -70,7 +70,6 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi bool SAtransform(const real* Bstate, real* Astate); bool SAtranspose(const real* Astate, real* Bstate); void calcInnovation(); - void calcHTransposeOLD(const real* yhat, real* Astate); void calcHTranspose(const real* yhat, real* Astate); virtual bool outputAnalysis(const std::string& suffix, real* Astate) = 0; void SBtransform(const real* Ustate, real* Bstate); diff --git a/src/precision.h b/src/precision.h index bfa14cb..3c32b55 100644 --- a/src/precision.h +++ b/src/precision.h @@ -13,7 +13,5 @@ #include typedef double real; -typedef unsigned long int integer; -//typedef unsigned int integer; #endif From 720e92cd46ad496a3bdd20fe58ba94f419b4e491 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Wed, 10 Jul 2024 15:47:27 -0600 Subject: [PATCH 12/14] Cleanup --- ncar_scripts/casper_h100_submit.sh | 2 +- ncar_scripts/derecho_a100_submit.sh | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ncar_scripts/casper_h100_submit.sh b/ncar_scripts/casper_h100_submit.sh index ebf17ed..403d0eb 100644 --- a/ncar_scripts/casper_h100_submit.sh +++ b/ncar_scripts/casper_h100_submit.sh @@ -4,7 +4,7 @@ #PBS -l select=1:ncpus=36:ompthreads=1:mem=700GB:ngpus=1 #PBS -l gpu_type=h100 #PBS -q casper -#PBS -l walltime=03:30:00 +#PBS -l walltime=05:00:00 #PBS -j oe #PBS -k eod diff --git a/ncar_scripts/derecho_a100_submit.sh b/ncar_scripts/derecho_a100_submit.sh index ff34744..f5ae92e 100644 --- a/ncar_scripts/derecho_a100_submit.sh +++ b/ncar_scripts/derecho_a100_submit.sh @@ -29,9 +29,7 @@ cd ncar_scripts # Run a case # ############## suffix="derecho_gpu" -#for i in supercell # hurricane_4panel -#for i in beltrami hurricane_4panel -for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel +for i in beltrami supercell hurricane typhoonChanthu2020 # hurricane_4panel do ./ncar_run.sh $SAMURAI_ROOT/ncar_scripts/TDRP/${i}.tdrp >& log_${i}_$suffix.$ID From c4b683b19c900cfd2ce33117789fef1174df4cc3 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Thu, 11 Jul 2024 08:37:02 -0600 Subject: [PATCH 13/14] Clarification on the size of int used in the code. --- src/CostFunction3D.cpp | 16 +++++++--------- src/CostFunction3D.h | 8 ++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index d08c2b6..84b9cdb 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -806,8 +806,8 @@ void CostFunction3D::calcInnovation() void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) { - uint64_t n,m; // uint64_t - uint64_t j,begin,end; // uint32_t + uint64_t n,m; + uint64_t j,begin,end; real tmp,val; #pragma acc data present(yhat,Astate) @@ -2578,17 +2578,15 @@ void CostFunction3D::calcHmatrix() //GPTLstop("CostFunction3D::calcHmatrix:nnz"); IH[mObs] = nnz; - std::cout << "CostFunction3D:: nnz " << nnz << std::endl; + std::cout << "CostFunction3D:: nonzeros " << nnz << std::endl; std::cout << "Memory usage for [H] (Mbytes): " << sizeof(real)*(nnz)/(1024.0*1024.0) << std::endl; H = new real[nnz]; - JH = new uint32_t [nnz]; // uint32_t + JH = new uint32_t [nnz]; Ht = new real[nnz]; - JHt = new uint32_t [nnz]; // uint32_t + JHt = new uint32_t [nnz]; std::cout << "CostFunction3D::calcHmatrix: before big loop" << std::endl; hi=0; - //#pragma omp parallel for private(m,mi,i,j,k,ii,iis,iie,jj,jjs,jje,kk,kks,kke,ibasis,jbasis,kbasis,iiNode,jjNode,kkNode,iNode,jNode,kNode,var,d,wgt_index,weight,cIndex) //[8.1] - //#pragma acc parallel loop vector gang vector_length(32) private(m,mi,i,j,k,ii,iis,iie,jj,jjs,jje,kk,kks,kke,ibasis,jbasis,kbasis,iiNode,jjNode,kkNode,iNode,jNode,kNode,var,d,wgt_index,weight,cIndex) for (m = 0; m < mObs; m++) { IH[m]=hi; mi = m*(7+varDim*derivDim); @@ -2670,8 +2668,8 @@ void CostFunction3D::calcHmatrix() void CostFunction3D::Htransform(const real* Cstate, real* Hstate) { - uint64_t i; // uint32_t - uint64_t j,begin,end; // uint64_t + uint64_t i; + uint64_t j,begin,end; real tmp; #pragma acc data present(Cstate,Hstate) diff --git a/src/CostFunction3D.h b/src/CostFunction3D.h index 74ecf45..3d7ab52 100644 --- a/src/CostFunction3D.h +++ b/src/CostFunction3D.h @@ -154,13 +154,13 @@ CostFunction3D(const Projection& proj, const int& numObs = 0, const int& stateSi // explicitly store the H matrix in CSR format real *H; - uint64_t *IH; // uint64_t - uint32_t *JH; // uint32_t + uint64_t *IH; // Array with extent:(nState+1) can take on values [0 to nonzeros] + uint32_t *JH; // Array with extent:(nonzeros) can take on values [0 to nState-1] // explicity store the H^t matrix in CSR format real *Ht; - uint64_t *IHt; // uint64_t - uint32_t *JHt; // uint32_t + uint64_t *IHt; // Array with extent(mObs+1_ can take on values [0 to nonzeros] + uint32_t *JHt; // Array with extent(nonzeros) can take on values [0 to mObs-1] int basisappx; real* basis0; From 2e5fb52703bdd0cc70645324b7a6483f56911bd5 Mon Sep 17 00:00:00 2001 From: johnmauff Date: Thu, 11 Jul 2024 09:32:40 -0600 Subject: [PATCH 14/14] Fixed OpenACC directive --- src/CostFunction3D.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CostFunction3D.cpp b/src/CostFunction3D.cpp index 84b9cdb..6ecaccc 100644 --- a/src/CostFunction3D.cpp +++ b/src/CostFunction3D.cpp @@ -819,7 +819,6 @@ void CostFunction3D::calcHTranspose(const real* yhat, real* Astate) for(n=0; n