From 969d18d7fec6d138bdcaca287d5d40a3af3886ba Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Thu, 12 Jun 2014 10:33:20 -0500
Subject: [PATCH 001/110] modified Axpy LocalToGlobal code to mimic the
 nonblocking consensus shown in Alg. 2 of T. Hoefler's DSDE paper

---
 include/El/core/AxpyInterface.hpp |   10 +-
 include/El/core/imports/mpi.hpp   |   12 +
 src/core/AxpyInterface.cpp        | 1447 +++++++++++++++--------------
 src/core/imports/mpi.cpp          |   16 +
 4 files changed, 775 insertions(+), 710 deletions(-)

diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp
index d893f6df9d..1a1c72b7da 100644
--- a/include/El/core/AxpyInterface.hpp
+++ b/include/El/core/AxpyInterface.hpp
@@ -28,7 +28,7 @@ class AxpyInterface
 public:
     AxpyInterface();
     ~AxpyInterface();
-
+
     AxpyInterface( AxpyType type, DistMatrix<T>& Z );
     AxpyInterface( AxpyType type, const DistMatrix<T>& Z );

@@ -46,6 +46,14 @@ class AxpyInterface
       EOM_TAG         =2,
       DATA_REQUEST_TAG=3,
       DATA_REPLY_TAG  =4;
+
+// State for the nonblocking consensus: poll the Issends, then an Ibarrier
+#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
+    bool DONE;
+    mpi::Request nb_bar_request;
+    bool nb_bar_active;
+    bool all_sends_are_finished;
+#endif
     bool attachedForLocalToGlobal_, attachedForGlobalToLocal_;
     byte sendDummy_, recvDummy_;

diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp
index b6748d9ba1..ef7e82a9e5 100644
--- a/include/El/core/imports/mpi.hpp
+++ b/include/El/core/imports/mpi.hpp
@@ -29,6 +29,11 @@ namespace mpi {
 #endif
 #endif

+// Use the MPI-3 IBarrier instead of El's strict EOM matching
+#ifndef EL_USE_IBARRIER
+#define EL_USE_IBARRIER
+#endif
+
 struct Comm
 {
     MPI_Comm comm;
@@ -71,6 +76,7 @@ typedef MPI_User_function UserFunction;
 // Standard constants
 const int ANY_SOURCE = MPI_ANY_SOURCE;
 const int ANY_TAG = MPI_ANY_TAG;
+const int ERR_RANK = MPI_ERR_RANK;
 #ifdef EL_HAVE_MPI_QUERY_THREAD
 const int THREAD_SINGLE = MPI_THREAD_SINGLE;
 const int THREAD_FUNNELED = MPI_THREAD_FUNNELED;
@@ -167,11 +173,17 @@ void Translate

 // Utilities
 void Barrier( Comm comm );
+#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
+void IBarrier( Comm comm, Request& request );
+#endif
 void Wait( Request& request );
 void Wait( Request& request, Status& status );
+// TODO: add another overload that also returns the statuses
+void WaitAny( int count, Request *requests, int *indx );
 void WaitAll( int numRequests, Request* requests );
 void WaitAll( int numRequests, Request* requests, Status* statuses );
 bool Test( Request& request );
+bool Test( Request& request, Status& status );
 bool IProbe( int source, int tag, Comm comm, Status& status );

 template<typename T>
diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp
index b4524a12d3..c477f0533b 100644
--- a/src/core/AxpyInterface.cpp
+++ b/src/core/AxpyInterface.cpp
@@ -13,747 +13,776 @@
 */
 #include "El-lite.hpp"

-namespace El {
-
-template<typename T>
-bool AxpyInterface<T>::Finished()
+namespace El
 {
-    DEBUG_ONLY(
-        CallStackEntry cse("AxpyInterface::Finished");
-        if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ )
-            LogicError("Not attached");
-    )
-    const Grid& g = ( attachedForLocalToGlobal_ ?
-                      localToGlobalMat_->Grid() :
-                      globalToLocalMat_->Grid() );
-    const Int p = g.Size();
-
-    bool finished = true;
-    for( Int rank=0; rank<p; ++rank )
-    {
-        if( !sentEomTo_[rank] || !haveEomFrom_[rank] )
-        {
-            finished = false;
-            break;
-        }
-    }
-    return finished;
-}
+  template < typename T > bool AxpyInterface < T >::Finished ()
+  {
+    DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished");
+                if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_)
+                LogicError ("Not attached");)
+    const Grid & g = (attachedForLocalToGlobal_ ?
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + bool finished = true; + for (Int rank = 0; rank < p; ++rank) + { + if (!sentEomTo_[rank] || !haveEomFrom_[rank]) + { + finished = false; + break; + } + } return finished; -} + } -template -void AxpyInterface::HandleEoms() -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleEoms")) - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : - globalToLocalMat_->Grid() ); - const Int p = g.Size(); + template < typename T > void AxpyInterface < T >::HandleEoms () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleEoms")) + const Grid & g = (attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); - UpdateRequestStatuses(); + UpdateRequestStatuses (); // Try to progress our EOM sends - for( Int i=0; i -void AxpyInterface::HandleLocalToGlobalData() -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleLocalToGlobalData")) - DistMatrix& Y = *localToGlobalMat_; - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int myRow = g.Row(); - const Int myCol = g.Col(); - + if (mpi::IProbe (mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + mpi::TaggedRecv (&recvDummy_, 1, source, EOM_TAG, g.VCComm ()); + haveEomFrom_[source] = true; + } + } + + template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) + DistMatrix < T > &Y = *localToGlobalMat_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); mpi::Status status; - if( mpi::IProbe( mpi::ANY_SOURCE, DATA_TAG, g.VCComm(), status ) ) - { - // Message exists, so recv and pack - const Int count = mpi::GetCount( status ); - DEBUG_ONLY( - if( count < Int(4*sizeof(Int)+sizeof(T)) ) - LogicError("Count was too small"); - ) - const Int source = status.MPI_SOURCE; - recvVector_.resize( count ); - byte* recvBuffer = recvVector_.data(); - mpi::TaggedRecv( recvBuffer, count, source, DATA_TAG, g.VCComm() ); - - // Extract the header - byte* head = recvBuffer; - const Int i = *reinterpret_cast(head); - head += sizeof(Int); - const Int j = *reinterpret_cast(head); - head += sizeof(Int); - const Int height = *reinterpret_cast(head); - head += sizeof(Int); - const Int width = *reinterpret_cast(head); - head += sizeof(Int); - const T alpha = *reinterpret_cast(head); - head += sizeof(T); - DEBUG_ONLY( - if( height < 0 || width < 0 ) - RuntimeError - ("Unpacked heights were negative:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - if( i < 0 || j < 0 ) - RuntimeError - ("Unpacked offsets were negative:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - if( i+height > Y.Height() || j+width > Y.Width() ) - RuntimeError - ("Unpacked submatrix was out of bounds:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - ) - - 
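// ------------------------------------------------------------------
// For reference only (not part of the patch): a minimal sketch of the
// nonblocking consensus (NBX), Alg. 2 of Hoefler et al., "Scalable
// Communication Protocols for Dynamic Sparse Data Exchange", which the
// new Detach() in this patch mirrors via El's TaggedISSend / IProbe /
// IBarrier / Test wrappers.  Requires <mpi.h>; processOneMessage() and
// allLocalIssendsFinished() are placeholder helpers, not Elemental API.
static void NbxConsensusSketch( MPI_Comm comm )
{
    MPI_Request barrier;
    bool barrierActive = false;
    int done = 0;
    while( !done )
    {
        int flag;
        MPI_Status status;
        MPI_Iprobe( MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &flag, &status );
        if( flag )
            processOneMessage( status, comm ); // recv one matched Issend
        if( barrierActive )
            MPI_Test( &barrier, &done, MPI_STATUS_IGNORE );
        else if( allLocalIssendsFinished() )   // local completion only
        {
            MPI_Ibarrier( comm, &barrier );    // start the consensus
            barrierActive = true;
        }
    }
}
// ------------------------------------------------------------------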
// Update Y - const T* XBuffer = reinterpret_cast(head); - const Int colAlign = (Y.ColAlign()+i) % r; - const Int rowAlign = (Y.RowAlign()+j) % c; - const Int colShift = Shift( myRow, colAlign, r ); - const Int rowShift = Shift( myCol, rowAlign, c ); - - const Int localHeight = Length( height, colShift, r ); - const Int localWidth = Length( width, rowShift, c ); - const Int iLocalOffset = Length( i, Y.ColShift(), r ); - const Int jLocalOffset = Length( j, Y.RowShift(), c ); - - for( Int t=0; t -void AxpyInterface::HandleGlobalToLocalRequest() -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleGlobalToLocalRequest")) - const DistMatrix& X = *globalToLocalMat_; - const Grid& g = X.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int myRow = g.Row(); - const Int myCol = g.Col(); + + if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) + { + // Message exists, so recv and pack + const Int count = mpi::GetCount < byte > (status); + DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) + LogicError ("Count was too small");) + const Int source = status.MPI_SOURCE; + recvVector_.resize (count); + byte *recvBuffer = recvVector_.data (); + mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); + + // Extract the header + byte *head = recvBuffer; + const Int i = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T alpha = *reinterpret_cast < const T * >(head); + head += sizeof (T); + DEBUG_ONLY (if (height < 0 || width < 0) + RuntimeError + ("Unpacked heights were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha); + if (i < 0 + || j < + 0) RuntimeError ("Unpacked offsets were negative:\n", + " i= ", i, std::hex, "(", i, + ")\n", std::dec, " j= ", j, + std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", + height, ")\n", std::dec, " width= ", + width, std::hex, "(", width, ")\n", + std::dec, " alpha= ", alpha); + if (i + height > Y.Height () + || j + width > + Y.Width ())RuntimeError + ("Unpacked submatrix was out of bounds:\n", " i= ", + i, std::hex, "(", i, ")\n", std::dec, " j= ", j, + std::hex, "(", j, ")\n", std::dec, " height=", height, + std::hex, "(", height, ")\n", std::dec, " width= ", + width, std::hex, "(", width, ")\n", std::dec, + " alpha= ", alpha);) + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(head); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += alpha * XCol[s]; + } + + // Free the memory for the recv buffer + recvVector_.clear (); + } + } + + template 
< typename T > + void AxpyInterface < T >::HandleGlobalToLocalRequest () + { + DEBUG_ONLY (CallStackEntry + cse ("AxpyInterface::HandleGlobalToLocalRequest")) const + DistMatrix < T > &X = *globalToLocalMat_; + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); mpi::Status status; - if( mpi::IProbe( mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm(), status ) ) - { - // Request exists, so recv - const Int source = status.MPI_SOURCE; - const Int recvSize = 4*sizeof(Int); - recvVector_.resize( recvSize ); - byte* recvBuffer = recvVector_.data(); - mpi::TaggedRecv - ( recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm() ); - - // Extract the header - const byte* recvHead = recvBuffer; - const Int i = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int j = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int height = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int width = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - - const Int colAlign = (X.ColAlign()+i) % r; - const Int rowAlign = (X.RowAlign()+j) % c; - const Int colShift = Shift( myRow, colAlign, r ); - const Int rowShift = Shift( myCol, rowAlign, c ); - - const Int iLocalOffset = Length( i, X.ColShift(), r ); - const Int jLocalOffset = Length( j, X.RowShift(), c ); - const Int localHeight = Length( height, colShift, r ); - const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight*localWidth; - - const Int bufferSize = 2*sizeof(Int) + numEntries*sizeof(T); - const Int index = - ReadyForSend - ( bufferSize, replyVectors_[source], - replySendRequests_[source], sendingReply_[source] ); - - // Pack the reply header - byte* sendBuffer = replyVectors_[source][index].data(); - byte* sendHead = sendBuffer; - *reinterpret_cast(sendHead) = myRow; sendHead += sizeof(Int); - *reinterpret_cast(sendHead) = myCol; sendHead += sizeof(Int); - - // Pack the payload - T* sendData = reinterpret_cast(sendHead); - for( Int t=0; t -AxpyInterface::AxpyInterface() -: attachedForLocalToGlobal_(false), attachedForGlobalToLocal_(false), - localToGlobalMat_(0), globalToLocalMat_(0) -{ } - -template -AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) - if( type == LOCAL_TO_GLOBAL ) - { - attachedForLocalToGlobal_ = true; - attachedForGlobalToLocal_ = false; - localToGlobalMat_ = &Z; - globalToLocalMat_ = 0; - } + if (mpi::IProbe (mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status)) + { + // Request exists, so recv + const Int source = status.MPI_SOURCE; + const Int recvSize = 4 * sizeof (Int); + recvVector_.resize (recvSize); + byte *recvBuffer = recvVector_.data (); + mpi::TaggedRecv + (recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm ()); + + // Extract the header + const byte *recvHead = recvBuffer; + const Int i = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int iLocalOffset = Length 
(i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int numEntries = localHeight * localWidth; + + const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); + const Int index = ReadyForSend (bufferSize, replyVectors_[source], + replySendRequests_[source], + sendingReply_[source]); + + // Pack the reply header + byte *sendBuffer = replyVectors_[source][index].data (); + byte *sendHead = sendBuffer; + *reinterpret_cast < Int * >(sendHead) = myRow; + sendHead += sizeof (Int); + *reinterpret_cast < Int * >(sendHead) = myCol; + sendHead += sizeof (Int); + + // Pack the payload + T *sendData = reinterpret_cast < T * >(sendHead); + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &sendData[t * localHeight]; + const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } + + // Fire off non-blocking send + mpi::TaggedISSend + (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + replySendRequests_[source][index]); + } + } + +template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalToGlobal_ (false), attachedForGlobalToLocal_ (false), + localToGlobalMat_ (0), + globalToLocalMat_ (0) + { + } + + template < typename T > + AxpyInterface < T >::AxpyInterface (AxpyType type, DistMatrix < T > &Z) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyInterface")) + if (type == LOCAL_TO_GLOBAL) + { + attachedForLocalToGlobal_ = true; + attachedForGlobalToLocal_ = false; + localToGlobalMat_ = &Z; + globalToLocalMat_ = 0; + } else - { - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = true; - localToGlobalMat_ = 0; - globalToLocalMat_ = &Z; - } - - const Int p = Z.Grid().Size(); - sentEomTo_.resize( p, false ); - haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); - sendingRequest_.resize( p ); - sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); -} - -template -AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) - if( type == LOCAL_TO_GLOBAL ) - { - LogicError("Cannot update a constant matrix"); - } + { + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = true; + localToGlobalMat_ = 0; + globalToLocalMat_ = &Z; + } + + const Int p = Z.Grid ().Size (); + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + + sendingData_.resize (p); + sendingRequest_.resize (p); + sendingReply_.resize (p); + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + + eomSendRequests_.resize (p); + } + + template < typename T > + AxpyInterface < T >::AxpyInterface (AxpyType type, + const DistMatrix < T > &X) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyInterface")) + if (type == LOCAL_TO_GLOBAL) + { + LogicError ("Cannot update a constant matrix"); + } else - { - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = true; - localToGlobalMat_ = 0; - globalToLocalMat_ = &X; - } - - const Int p = X.Grid().Size(); - sentEomTo_.resize( p, false ); - haveEomFrom_.resize( p, false ); - - 
sendingData_.resize( p ); - sendingRequest_.resize( p ); - sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); -} - -template -AxpyInterface::~AxpyInterface() -{ - if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - { - if( std::uncaught_exception() ) - { - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : - globalToLocalMat_->Grid() ); - std::ostringstream os; - os << g.Rank() - << "Uncaught exception detected during AxpyInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) - } - else - { - Detach(); - } - } -} - -template -void AxpyInterface::Attach( AxpyType type, DistMatrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Attach")) - if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError("Must detach before reattaching."); - - if( type == LOCAL_TO_GLOBAL ) - { - attachedForLocalToGlobal_ = true; - localToGlobalMat_ = &Z; - } + { + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = true; + localToGlobalMat_ = 0; + globalToLocalMat_ = &X; + } + + const Int p = X.Grid ().Size (); + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + + sendingData_.resize (p); + sendingRequest_.resize (p); + sendingReply_.resize (p); + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + + eomSendRequests_.resize (p); + } + + template < typename T > AxpyInterface < T >::~AxpyInterface () + { + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + { + if (std::uncaught_exception ()) + { + const Grid & g = (attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + std::ostringstream os; + os << g.Rank () + << + "Uncaught exception detected during AxpyInterface destructor " + "that required a call to Detach. 
Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str (); + DEBUG_ONLY (DumpCallStack ())} + else + { + Detach (); + } + } + } + + template < typename T > + void AxpyInterface < T >::Attach (AxpyType type, DistMatrix < T > &Z) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + LogicError ("Must detach before reattaching."); + + if (type == LOCAL_TO_GLOBAL) + { + attachedForLocalToGlobal_ = true; + localToGlobalMat_ = &Z; + } else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &Z; - } - - const Int p = Z.Grid().Size(); - sentEomTo_.resize( p, false ); - haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); - sendingRequest_.resize( p ); - sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); -} - -template -void AxpyInterface::Attach( AxpyType type, const DistMatrix& X ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Attach")) - if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError("Must detach before reattaching."); - - if( type == LOCAL_TO_GLOBAL ) - { - LogicError("Cannot update a constant matrix"); - } + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &Z; + } + + const Int p = Z.Grid ().Size (); + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + + sendingData_.resize (p); + sendingRequest_.resize (p); + sendingReply_.resize (p); + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + + eomSendRequests_.resize (p); + } + + template < typename T > + void AxpyInterface < T >::Attach (AxpyType type, + const DistMatrix < T > &X) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + LogicError ("Must detach before reattaching."); + + if (type == LOCAL_TO_GLOBAL) + { + LogicError ("Cannot update a constant matrix"); + } else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &X; - } - - const Int p = X.Grid().Size(); - sentEomTo_.resize( p, false ); - haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); - sendingRequest_.resize( p ); - sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); -} - -template -void AxpyInterface::Axpy( T alpha, Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Axpy")) - if( attachedForLocalToGlobal_ ) - AxpyLocalToGlobal( alpha, Z, i, j ); - else if( attachedForGlobalToLocal_ ) - AxpyGlobalToLocal( alpha, Z, i, j ); + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &X; + } + + const Int p = X.Grid ().Size (); + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + + sendingData_.resize (p); + sendingRequest_.resize (p); + sendingReply_.resize (p); + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize 
(p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + + eomSendRequests_.resize (p); + } + + template < typename T > + void AxpyInterface < T >::Axpy (T alpha, Matrix < T > &Z, Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) + if (attachedForLocalToGlobal_) + AxpyLocalToGlobal (alpha, Z, i, j); + else if (attachedForGlobalToLocal_) + AxpyGlobalToLocal (alpha, Z, i, j); else - LogicError("Cannot axpy before attaching."); -} - -template -void AxpyInterface::Axpy( T alpha, const Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Axpy")) - if( attachedForLocalToGlobal_ ) - AxpyLocalToGlobal( alpha, Z, i, j ); - else if( attachedForGlobalToLocal_ ) - LogicError("Cannot update a constant matrix."); + LogicError ("Cannot axpy before attaching."); + } + + template < typename T > + void AxpyInterface < T >::Axpy (T alpha, const Matrix < T > &Z, Int i, + Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) + if (attachedForLocalToGlobal_) + AxpyLocalToGlobal (alpha, Z, i, j); + else if (attachedForGlobalToLocal_) + LogicError ("Cannot update a constant matrix."); else - LogicError("Cannot axpy before attaching."); -} + LogicError ("Cannot axpy before attaching."); + } // Update Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width -template -void AxpyInterface::AxpyLocalToGlobal -( T alpha, const Matrix& X, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyLocalToGlobal")) - DistMatrix& Y = *localToGlobalMat_; - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if( i+X.Height() > Y.Height() || j+X.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int height = X.Height(); - const Int width = X.Width(); + template < typename T > + void AxpyInterface < T >::AxpyLocalToGlobal + (T alpha, const Matrix < T > &X, Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyLocalToGlobal")) + DistMatrix < T > &Y = *localToGlobalMat_; + if (i < 0 || j < 0) + LogicError ("Submatrix offsets must be non-negative"); + if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) + LogicError ("Submatrix out of bounds of global matrix"); + + all_sends_are_finished = false; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myProcessRow = g.Row (); + const Int myProcessCol = g.Col (); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + + const Int height = X.Height (); + const Int width = X.Width (); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - // Pack the payload - T* sendData = reinterpret_cast(head); - const T* XBuffer = X.LockedBuffer(); - const Int XLDim = X.LDim(); - for( Int t=0; t(head) = i; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = j; + head += sizeof (Int); + 
*reinterpret_cast < Int * >(head) = height; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = width; + head += sizeof (Int); + *reinterpret_cast < T * >(head) = alpha; + head += sizeof (T); + + // Pack the payload + T *sendData = reinterpret_cast < T * >(head); + const T *XBuffer = X.LockedBuffer (); + const Int XLDim = X.LDim (); + for (Int t = 0; t < localWidth; ++t) + { + T *thisSendCol = &sendData[t * localHeight]; + const T *thisXCol = &XBuffer[(rowShift + t * c) * XLDim]; + for (Int s = 0; s < localHeight; ++s) + thisSendCol[s] = thisXCol[colShift + s * r]; + } + // Fire off the non-blocking send + mpi::TaggedISSend + (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + dataSendRequests_[destination][index]); + } + + all_sends_are_finished = true; + receivingRow = (receivingRow + 1) % r; + if (receivingRow == 0) + receivingCol = (receivingCol + 1) % c; + } + } // Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix -template -void AxpyInterface::AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyGlobalToLocal")) - const DistMatrix& X = *globalToLocalMat_; - - const Int height = Y.Height(); - const Int width = Y.Width(); - if( i+height > X.Height() || j+width > X.Width() ) - LogicError("Invalid AxpyGlobalToLocal submatrix"); - - const Grid& g = X.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); + template < typename T > + void AxpyInterface < T >::AxpyGlobalToLocal (T alpha, Matrix < T > &Y, + Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyGlobalToLocal")) + const DistMatrix < T > &X = *globalToLocalMat_; + + const Int height = Y.Height (); + const Int width = Y.Width (); + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); // Send out the requests to all processes in the grid - for( Int rank=0; rank(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - - // Begin the non-blocking send - mpi::TaggedISSend - ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm(), - requestSendRequests_[rank][index] ); - } + for (Int rank = 0; rank < p; ++rank) + { + const Int bufferSize = 4 * sizeof (Int); + const Int index = ReadyForSend (bufferSize, requestVectors_[rank], + requestSendRequests_[rank], + sendingRequest_[rank]); + + // Copy the request header into the send buffer + byte *sendBuffer = requestVectors_[rank][index].data (); + byte *head = sendBuffer; + *reinterpret_cast < Int * >(head) = i; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = j; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = height; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = width; + head += sizeof (Int); + + // Begin the non-blocking send + mpi::TaggedISSend + (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), + requestSendRequests_[rank][index]); + } // Receive all of the replies Int numReplies = 0; - while( numReplies < p ) - { - HandleGlobalToLocalRequest(); - - mpi::Status status; - if( mpi::IProbe( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm(), status ) ) - { - const Int source = status.MPI_SOURCE; - - // Ensure that we have a recv buffer - const Int count = mpi::GetCount( status ); 
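// ------------------------------------------------------------------
// For illustration only (not patch content): the probe-then-receive
// idiom used throughout this file, in plain MPI, sizing the buffer from
// the probed status before receiving.  Each received buffer then has
// the fixed header layout [Int i][Int j][Int height][Int width]
// (plus [T alpha][column-packed payload] for DATA messages), unpacked
// with reinterpret_cast exactly as above.  Requires <mpi.h> and
// <vector>; the tag and comm arguments are placeholders.
static bool RecvProbedMessage( MPI_Comm comm, int tag,
                               std::vector<unsigned char>& buffer )
{
    int flag;
    MPI_Status status;
    MPI_Iprobe( MPI_ANY_SOURCE, tag, comm, &flag, &status );
    if( !flag )
        return false;
    int count;
    MPI_Get_count( &status, MPI_UNSIGNED_CHAR, &count ); // size first
    buffer.resize( count );
    MPI_Recv( buffer.data(), count, MPI_UNSIGNED_CHAR,
              status.MPI_SOURCE, tag, comm, MPI_STATUS_IGNORE );
    return true;
}
// ------------------------------------------------------------------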
- recvVector_.resize( count ); - byte* recvBuffer = recvVector_.data(); - - // Receive the data - mpi::TaggedRecv - ( recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm() ); - - // Unpack the reply header - const byte* head = recvBuffer; - const Int row = *reinterpret_cast(head); - head += sizeof(Int); - const Int col = *reinterpret_cast(head); - head += sizeof(Int); - const T* recvData = reinterpret_cast(head); - - // Compute the local heights and offsets - const Int colAlign = (X.ColAlign()+i) % r; - const Int rowAlign = (X.RowAlign()+j) % c; - const Int colShift = Shift( row, colAlign, r ); - const Int rowShift = Shift( col, rowAlign, c ); - const Int localHeight = Length( height, colShift, r ); - const Int localWidth = Length( width, rowShift, c ); - - // Unpack the local matrix - for( Int t=0; t -Int AxpyInterface::ReadyForSend -( Int sendSize, - std::deque>& sendVectors, - std::deque& requests, - std::deque& requestStatuses ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::ReadyForSend")) - const Int numCreated = sendVectors.size(); - DEBUG_ONLY( - if( numCreated != Int(requests.size()) || - numCreated != Int(requestStatuses.size()) ) - LogicError("size mismatch"); - ) - for( Int i=0; i (status); + recvVector_.resize (count); + byte *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm ()); + + // Unpack the reply header + const byte *head = recvBuffer; + const Int row = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int col = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T *recvData = reinterpret_cast < const T * >(head); + + // Compute the local heights and offsets + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (row, colAlign, r); + const Int rowShift = Shift (col, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (0, rowShift + t * c); + const T *XCol = &recvData[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] += alpha * XCol[s]; + } + + ++numReplies; + } + } + } + + template < typename T > + Int AxpyInterface < T >::ReadyForSend + (Int sendSize, + std::deque < std::vector < byte >> &sendVectors, + std::deque < mpi::Request > &requests, + std::deque < bool > &requestStatuses) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReadyForSend")) + const Int numCreated = sendVectors.size (); + DEBUG_ONLY (if (numCreated != Int (requests.size ()) || + numCreated != + Int (requestStatuses.size ()))LogicError + ("size mismatch");) + for (Int i = 0; i < numCreated; ++i) + { + // If this request is still running, test to see if it finished. + if (requestStatuses[i]) + { + const bool finished = mpi::Test (requests[i]); + requestStatuses[i] = !finished; + } + + if (!requestStatuses[i]) + { + requestStatuses[i] = true; + sendVectors[i].resize (sendSize); + return i; + } + } + + sendVectors.resize (numCreated + 1); + sendVectors[numCreated].resize (sendSize); + requests.push_back (mpi::REQUEST_NULL); + requestStatuses.push_back (true); return numCreated; -} + } + + template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) + const Grid & g = (attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + const Int numDataSendRequests = dataSendRequests_[i].size (); + for (Int j = 0; j < numDataSendRequests; ++j) + if (sendingData_[i][j]) + sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); + const Int numRequestSendRequests = requestSendRequests_[i].size (); + for (Int j = 0; j < numRequestSendRequests; ++j) + if (sendingRequest_[i][j]) + sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); + const Int numReplySendRequests = replySendRequests_[i].size (); + for (Int j = 0; j < numReplySendRequests; ++j) + if (sendingReply_[i][j]) + sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); + } + } + + template < typename T > void AxpyInterface < T >::Detach () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) + if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) + LogicError ("Must attach before detaching."); + + DONE = false; + nb_bar_active = false; + + const Grid & g = (attachedForLocalToGlobal_ ? + localToGlobalMat_-> + Grid () : globalToLocalMat_->Grid ()); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) + while (!DONE) +#else + while (!Finished ()) +#endif + { + if (attachedForLocalToGlobal_) + HandleLocalToGlobalData (); + else + HandleGlobalToLocalRequest (); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) + if (nb_bar_active) + { + // test for IBarrier completion + DONE = mpi::Test (nb_bar_request); + } + else + { + if (all_sends_are_finished) + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } +#else + HandleEoms (); +#endif + } + +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#else + mpi::Barrier (g.VCComm ()); +#endif -template -void AxpyInterface::UpdateRequestStatuses() -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::UpdateRequestStatuses")) - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : - globalToLocalMat_->Grid() ); - const Int p = g.Size(); - - for( Int i=0; i -void AxpyInterface::Detach() -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Detach")) - if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) - LogicError("Must attach before detaching."); + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = false; + recvVector_.clear (); - const Grid& g = ( attachedForLocalToGlobal_ ? 
- localToGlobalMat_->Grid() : - globalToLocalMat_->Grid() ); + sentEomTo_.clear (); + haveEomFrom_.clear (); - while( !Finished() ) - { - if( attachedForLocalToGlobal_ ) - HandleLocalToGlobalData(); - else - HandleGlobalToLocalRequest(); - HandleEoms(); - } + sendingData_.clear (); + sendingRequest_.clear (); + sendingReply_.clear (); - mpi::Barrier( g.VCComm() ); + dataVectors_.clear (); + requestVectors_.clear (); + replyVectors_.clear (); - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = false; - recvVector_.clear(); - sentEomTo_.clear(); - haveEomFrom_.clear(); - - sendingData_.clear(); - sendingRequest_.clear(); - sendingReply_.clear(); - - dataVectors_.clear(); - requestVectors_.clear(); - replyVectors_.clear(); - - dataSendRequests_.clear(); - requestSendRequests_.clear(); - replySendRequests_.clear(); - - eomSendRequests_.clear(); -} - -template class AxpyInterface; -template class AxpyInterface; -template class AxpyInterface; -template class AxpyInterface>; -template class AxpyInterface>; - -} // namespace El + dataSendRequests_.clear (); + requestSendRequests_.clear (); + replySendRequests_.clear (); + + eomSendRequests_.clear (); + } + + template class AxpyInterface < Int >; + template class AxpyInterface < float >; + template class AxpyInterface < double >; + template class AxpyInterface < Complex < float >>; + template class AxpyInterface < Complex < double >>; + +} // namespace El diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index c7de57712f..ae6964e22e 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -358,6 +358,14 @@ void Barrier( Comm comm ) SafeMpi( MPI_Barrier( comm.comm ) ); } +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +void IBarrier( Comm comm, Request& request ) +{ + DEBUG_ONLY(CallStackEntry cse("mpi::IBarrier")) + SafeMpi( MPI_Ibarrier( comm.comm, &request ) ); +} +#endif + // Test for completion bool Test( Request& request ) { @@ -368,6 +376,14 @@ bool Test( Request& request ) return flag; } +bool Test( Request& request, Status& status ) +{ + DEBUG_ONLY(CallStackEntry cse("mpi::Test")) + int flag; + SafeMpi( MPI_Test( &request, &flag, &status ) ); + return flag; +} + // Ensure that the request finishes before continuing void Wait( Request& request ) { From b8e2e8dd06a216c09ffeca109b9cf989f6a5741e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 12 Jun 2014 11:08:45 -0500 Subject: [PATCH 002/110] forgot to add mpi-3 macros around a variable --- src/core/AxpyInterface.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index c477f0533b..ecb1c8b1fb 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -478,7 +478,9 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) LogicError ("Submatrix out of bounds of global matrix"); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) all_sends_are_finished = false; +#endif const Grid & g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); @@ -545,7 +547,9 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo dataSendRequests_[destination][index]); } +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) all_sends_are_finished = true; +#endif receivingRow = (receivingRow + 1) % r; if (receivingRow == 0) receivingCol = (receivingCol + 1) % c; From 61071e7aee560b78c55589d5bc5702da48665d87 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 12 Jun 
2014 18:53:33 -0500 Subject: [PATCH 003/110] preposting receives, alternative to iprobe --- src/core/AxpyInterface.cpp | 98 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 5 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index ecb1c8b1fb..d6a91b1980 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -110,6 +110,51 @@ namespace El const Int myCol = g.Col (); mpi::Status status; +#ifdef EL_PREPOST_MESSAGES + mpi::Status prepost_status; + int indx = -1; + int flag; + bool comp = false; + // one of the recvs that was preposted in Attach is expected to complete... + comp = + mpi::Testany (max_preposted_messages, prepost_reqs, indx, + prepost_status); + // make sure at least one message is received + while (!comp) + { + comp = + mpi::Testany (max_preposted_messages, prepost_reqs, indx, + prepost_status); + } + // data received + //TODO How to resize a large to smaller buffer? + if (prepost_status.MPI_TAG == SMALL_DATA_TAG) + { + // resize buffer + const Int count = mpi::GetCount < byte > (prepost_status); + recvVector_.resize (count); + recvBuffer = recvVector_.data (); + + } + else if (prepost_status.MPI_TAG == MORE_DATA_TAG) + { + // only first half is in receive buffer, prepare another receive + const Int count = mpi::GetCount < byte > (prepost_status); + DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) + LogicError ("Count was too small");) + const Int source = prepost_status.MPI_SOURCE; + + recvVector_.resize (count + PREPOST_THRESHOLD); + recvBuffer = recvVector_.data (); + + mpi::TaggedRecv (recvBuffer + PREPOST_THRESHOLD, count, source, + DATA_TAG, g.VCComm ()); + } + else + { + return; + } +#else if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) { // Message exists, so recv and pack @@ -120,7 +165,7 @@ namespace El recvVector_.resize (count); byte *recvBuffer = recvVector_.data (); mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); - +#endif // Extract the header byte *head = recvBuffer; const Int i = *reinterpret_cast < const Int * >(head); @@ -181,9 +226,12 @@ namespace El YCol[s] += alpha * XCol[s]; } +#ifdef EL_PREPOST_MESSAGES +#else // Free the memory for the recv buffer recvVector_.clear (); } +#endif } template < typename T > @@ -378,6 +426,21 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo { attachedForLocalToGlobal_ = true; localToGlobalMat_ = &Z; +//TODO Is this the right place to prepost? +#ifdef EL_PREPOST_MESSAGES +//TODO Make max-small-messages configurable, can we calculate an upper bound? 
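// ------------------------------------------------------------------
// For illustration only (not part of the patch): the receiving half of
// the preposting scheme this commit experiments with, in plain MPI.
// Instead of probing, a fixed pool of PREPOST_THRESHOLD-byte receives
// is posted in Attach() and completed here with Testany; the returned
// status.MPI_TAG then says whether the whole message arrived
// (SMALL_DATA_TAG) or a second receive for the remainder is needed
// (MORE_DATA_TAG).  The constant and tag names mirror the patch but
// are assumptions in this sketch.  Requires <mpi.h>.
static int WaitForPrepostedMessage
( MPI_Request* reqs, int numPreposted, MPI_Status& status )
{
    int indx, flag = 0;
    while( !flag ) // spin until one of the preposted receives lands
        MPI_Testany( numPreposted, reqs, &indx, &flag, &status );
    return indx;
}
// ------------------------------------------------------------------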
+ max_preposted_messages = Z.Grid ().Size () * 100; //function of the grid size + prepost_reqs = new mpi::Request[max_preposted_messages]; + + for (int i = 0; i < max_preposted_messages; i++) + { + //resize to default prepost size + recvVector_.resize (PREPOST_THRESHOLD); + recvBuffer = recvVector_.data (); + mpi::TaggedIRecv (recvBuffer, PREPOST_THRESHOLD, mpi::ANY_SOURCE, + mpi::ANY_TAG, g.VCComm (), &prepost_reqs[i]); + } +#endif } else { @@ -542,11 +605,31 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo thisSendCol[s] = thisXCol[colShift + s * r]; } // Fire off the non-blocking send +#ifdef EL_PREPOST_MESSAGES + if (bufferSize <= PREPOST_THRESHOLD) + { + mpi::TaggedISSend + (sendBuffer, bufferSize, destination, SMALL_DATA_TAG, + g.VCComm (), dataSendRequests_[destination][index]); + } + else + { + //SMALL_DATA_TAG + mpi::TaggedISSend + (sendBuffer, PREPOST_THRESHOLD, destination, MORE_DATA_TAG, + g.VCComm (), dataSendRequests_[destination][index]); + //remaining data using MORE_DATA_TAG + mpi::TaggedISSend + (sendBuffer + PREPOST_THRESHOLD, + bufferSize - PREPOST_THRESHOLD, destination, DATA_TAG, + g.VCComm (), dataSendRequests_[destination][index]); + } +#else mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); +#endif } - #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) all_sends_are_finished = true; #endif @@ -720,12 +803,13 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) LogicError ("Must attach before detaching."); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) DONE = false; nb_bar_active = false; - +#endif const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_-> - Grid () : globalToLocalMat_->Grid ()); + localToGlobalMat_->Grid () : globalToLocalMat_-> + Grid ()); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) while (!DONE) #else @@ -781,6 +865,10 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo replySendRequests_.clear (); eomSendRequests_.clear (); + +#ifdef EL_PREPOST_MESSAGES + delete[]prepost_reqs; +#endif } template class AxpyInterface < Int >; From 2d52d96c9986b9324552b27024b88b89e29cca9a Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 17 Jun 2014 11:12:04 -0500 Subject: [PATCH 004/110] contains code to avoid probe overhead for short messages, Jeffs rma interface code sync from his forked repo and defined EL_MPI_EXPERIMENTAL macro --- include/El/core/RmaInterface.hpp | 50 ++++++++++++++ src/core/RmaInterface.cpp | 113 +++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 include/El/core/RmaInterface.hpp create mode 100644 src/core/RmaInterface.cpp diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp new file mode 100644 index 0000000000..9534a16d7e --- /dev/null +++ b/include/El/core/RmaInterface.hpp @@ -0,0 +1,50 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Jeff Hammond (Intel) + All rights reserved. + + Authors: + Jeff Hammond adapted the RMA interface from the AXPY one. 
+
+   This file is part of Elemental and is under the BSD 2-Clause License,
+   which can be found in the LICENSE file in the root directory, or at
+   http://opensource.org/licenses/BSD-2-Clause
+*/
+#pragma once
+#ifndef EL_RMAINTERFACE_HPP
+#define EL_RMAINTERFACE_HPP
+
+namespace El {
+
+template<typename T>
+class RmaInterface
+{
+public:
+    RmaInterface();
+    ~RmaInterface();
+
+    RmaInterface( DistMatrix<T>& Z );
+    RmaInterface( const DistMatrix<T>& Z );
+
+    void Attach( DistMatrix<T>& Z );
+    void Attach( const DistMatrix<T>& Z );
+
+    void Put( Matrix<T>& Z, Int i, Int j );
+    void Put( const Matrix<T>& Z, Int i, Int j );
+
+    void Get( Matrix<T>& Z, Int i, Int j );
+    void Get( const Matrix<T>& Z, Int i, Int j );
+
+    void Acc( Matrix<T>& Z, Int i, Int j );
+    void Acc( const Matrix<T>& Z, Int i, Int j );
+
+    void Detach();
+
+private:
+
+};
+
+} // namespace El
+
+#endif // ifndef EL_RMAINTERFACE_HPP
diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp
new file mode 100644
index 0000000000..3b6b8938af
--- /dev/null
+++ b/src/core/RmaInterface.cpp
@@ -0,0 +1,113 @@
+/*
+   Copyright (c) 2009-2014, Jack Poulson
+   Copyright (c) 2011, The University of Texas at Austin
+   Copyright (c) 2014, Jeff Hammond (Intel)
+   All rights reserved.
+
+   Authors:
+   Jeff Hammond adapted the RMA interface from the AXPY one.
+
+   This file is part of Elemental and is under the BSD 2-Clause License,
+   which can be found in the LICENSE file in the root directory, or at
+   http://opensource.org/licenses/BSD-2-Clause
+*/
+#include "El-lite.hpp"
+
+namespace El {
+
+template<typename T>
+RmaInterface<T>::RmaInterface( DistMatrix<T>& Z )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface"))
+}
+
+template<typename T>
+RmaInterface<T>::RmaInterface( const DistMatrix<T>& X )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface"))
+}
+
+template<typename T>
+RmaInterface<T>::~RmaInterface()
+{
+    {
+        if( std::uncaught_exception() )
+        {
+            std::ostringstream os;
+            os << "Uncaught exception detected during RmaInterface destructor "
+                  "that required a call to Detach. 
Instead of allowing for the "
+                  "possibility of Detach throwing another exception and "
+                  "resulting in a 'terminate', we instead immediately dump the "
+                  "call stack (if not in RELEASE mode) since the program will "
+                  "likely hang:" << std::endl;
+            std::cerr << os.str();
+            DEBUG_ONLY(DumpCallStack())
+        }
+        else
+        {
+            Detach();
+        }
+    }
+}
+
+template<typename T>
+void RmaInterface<T>::Attach( DistMatrix<T>& Z )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach"))
+}
+
+template<typename T>
+void RmaInterface<T>::Attach( const DistMatrix<T>& X )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach"))
+}
+
+template<typename T>
+void RmaInterface<T>::Put( Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put"))
+}
+
+template<typename T>
+void RmaInterface<T>::Put( const Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put"))
+}
+
+template<typename T>
+void RmaInterface<T>::Get( Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get"))
+}
+
+template<typename T>
+void RmaInterface<T>::Get( const Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get"))
+}
+
+template<typename T>
+void RmaInterface<T>::Acc( Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc"))
+}
+
+template<typename T>
+void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc"))
+}
+
+template<typename T>
+void RmaInterface<T>::Detach()
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach"))
+}
+
+template class RmaInterface<Int>;
+template class RmaInterface<float>;
+template class RmaInterface<double>;
+template class RmaInterface<Complex<float>>;
+template class RmaInterface<Complex<double>>;
+
+} // namespace El

From 28faf2567aa6f3bd6d928b74b2698d96eb0a2403 Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Wed, 25 Jun 2014 23:01:55 -0500
Subject: [PATCH 005/110] committing after a week: removed the prepost-related
 code because of its poor performance; added the nonblocking-consensus code
 and a dummy RMA interface (the RmaInterface code is cut-and-paste for now
 and will be actively worked on)

---
 include/El/core.hpp               |    1 +
 include/El/core/AxpyInterface.hpp |    8 +-
 include/El/core/RmaInterface.hpp  |   18 +-
 include/El/core/imports/mpi.hpp   |   48 +-
 src/core/AxpyInterface.cpp        |  182 +-
 src/core/RmaInterface.cpp         |  221 +-
 src/core/imports/mpi.cpp          | 5473 ++++++++++++++++++-----------
 7 files changed, 3786 insertions(+), 2165 deletions(-)

diff --git a/include/El/core.hpp b/include/El/core.hpp
index 6c2d961ebc..a73e9c404e 100644
--- a/include/El/core.hpp
+++ b/include/El/core.hpp
@@ -148,5 +148,6 @@ template class BlockDistMatrix;
 #include "El/core/random/decl.hpp"
 #include "El/core/random/impl.hpp"
 #include "El/core/AxpyInterface.hpp"
+#include "El/core/RmaInterface.hpp"

 #endif // ifndef EL_CORE_HPP
diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp
index 1a1c72b7da..34caad7f4b 100644
--- a/include/El/core/AxpyInterface.hpp
+++ b/include/El/core/AxpyInterface.hpp
@@ -45,18 +45,16 @@ class AxpyInterface
       DATA_TAG        =1,
       EOM_TAG         =2,
       DATA_REQUEST_TAG=3,
-      DATA_REPLY_TAG  =4;
+      DATA_REPLY_TAG  =4
+      ;

 //request object for polling on Issends
 #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
-    bool DONE;
-    mpi::Request nb_bar_request;
-    bool nb_bar_active;
     bool all_sends_are_finished;
 #endif
-
     bool attachedForLocalToGlobal_, attachedForGlobalToLocal_;
     byte sendDummy_, recvDummy_;
+
     DistMatrix<T>* localToGlobalMat_;
     const DistMatrix<T>* globalToLocalMat_;

diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp
index 9534a16d7e..c8ab37fe87 100644
--- a/include/El/core/RmaInterface.hpp
+++ 
b/include/El/core/RmaInterface.hpp @@ -15,6 +15,7 @@ #ifndef EL_RMAINTERFACE_HPP #define EL_RMAINTERFACE_HPP +#if MPI_VERSION>=3 namespace El { template @@ -30,21 +31,26 @@ class RmaInterface void Attach( DistMatrix& Z ); void Attach( const DistMatrix& Z ); - void Put( Matrix& Z, Int i, Int j ); - void Put( const Matrix& Z, Int i, Int j ); + void Put( T alpha, Matrix& Z, Int i, Int j ); + void Put( T alpha, const Matrix& Z, Int i, Int j ); void Get( Matrix& Z, Int i, Int j ); void Get( const Matrix& Z, Int i, Int j ); - void Acc( Matrix& Z, Int i, Int j ); - void Acc( const Matrix& Z, Int i, Int j ); + void Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ); + void Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ); + + void Flush( const Matrix& Z, Int i, Int j); void Detach(); private: - + mpi::Window window; + std::vector getVector_, putVector_; + DistMatrix* localToGlobalMat_; + const DistMatrix* globalToLocalMat_; }; } // namespace El - +#endif #endif // ifndef EL_RMAINTERFACE_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index ef7e82a9e5..db35336503 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -34,6 +34,11 @@ namespace mpi { #define EL_USE_IBARRIER #endif +//Experimental MPI performance enhancers +#ifndef EL_MPI_EXPERIMENTAL +#define EL_MPI_EXPERIMENTAL +#endif + struct Comm { MPI_Comm comm; @@ -72,7 +77,19 @@ typedef MPI_Errhandler ErrorHandler; typedef MPI_Request Request; typedef MPI_Status Status; typedef MPI_User_function UserFunction; - +#if MPI_VERSION >= 3 +typedef MPI_Win Window; +typedef enum +{ + STRICT_ACC_ORDERING = 0, + PARTIAL_ACC_ORDERING = 2, + NO_ACC_ORDERING = 4 +} acc_order_t; +//TODO update these +const int MAX_OUTSTANDING_NB = 100000; +const int FLUSH_FREQUENCY = 10000; +#endif +typedef MPI_Info Info; // Standard constants const int ANY_SOURCE = MPI_ANY_SOURCE; const int ANY_TAG = MPI_ANY_TAG; @@ -171,6 +188,32 @@ void Translate ( Comm origComm, int size, const int* origRanks, Comm newComm, int* newRanks ); +//MPI-3 one-sided +#if MPI_VERSION>=3 +void SetWindowProp (Window& window, int prop); +//NOTE assuming MPI_MODE_NOCHECK +void WindowLock( int rank, Window& window ); +void WindowLock( Window& window ); +void WindowUnlock( int rank, Window& window ); +void WindowUnlock( Window& window ); +void WindowCreate( int size, Comm comm, Window& window ); +void WindowCreate( int size, Info info, Comm comm, Window& window ); +void Iput( void *source, int source_size, int target_rank, + int target_size, Window& window); +void Rput( void *source, int source_size, int target_rank, int target_size, + Window& window, Request& request); +void Iget( void *source, int source_size, int target_rank, + int target_size, Window& window); +void Rget( void *source, int source_size, int target_rank, int target_size, + Window& window, Request& request); +void Iacc( void *source, int source_size, int target_rank, + int target_size, Op &op, Window& window); +void Racc( void *source, int source_size, int target_rank, int target_size, + Op &op, Window& window, Request& request); +void Flush( int target_rank, Window& window, bool isLocalCompletion ); +void Flush( Window& window, bool isLocalCompletion ); +#endif + // Utilities void Barrier( Comm comm ); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) @@ -184,6 +227,9 @@ void WaitAll( int numRequests, Request* requests ); void WaitAll( int numRequests, Request* requests, Status* statuses ); bool Test( Request& request ); bool Test( Request& request, Status& 
status ); +bool Testany( int count, Request* requests ); +bool Testany( int count, Request* requests, int& indx ); +bool Testany( int count, Request* requests, int& indx, Status& status ); bool IProbe( int source, int tag, Comm comm, Status& status ); template diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index d6a91b1980..ab32554db5 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -110,53 +110,11 @@ namespace El const Int myCol = g.Col (); mpi::Status status; -#ifdef EL_PREPOST_MESSAGES - mpi::Status prepost_status; - int indx = -1; - int flag; - bool comp = false; - // one of the recvs that was preposted in Attach is expected to complete... - comp = - mpi::Testany (max_preposted_messages, prepost_reqs, indx, - prepost_status); - // make sure at least one message is received - while (!comp) - { - comp = - mpi::Testany (max_preposted_messages, prepost_reqs, indx, - prepost_status); - } - // data received - //TODO How to resize a large to smaller buffer? - if (prepost_status.MPI_TAG == SMALL_DATA_TAG) - { - // resize buffer - const Int count = mpi::GetCount < byte > (prepost_status); - recvVector_.resize (count); - recvBuffer = recvVector_.data (); - - } - else if (prepost_status.MPI_TAG == MORE_DATA_TAG) - { - // only first half is in receive buffer, prepare another receive - const Int count = mpi::GetCount < byte > (prepost_status); - DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) - LogicError ("Count was too small");) - const Int source = prepost_status.MPI_SOURCE; - - recvVector_.resize (count + PREPOST_THRESHOLD); - recvBuffer = recvVector_.data (); - - mpi::TaggedRecv (recvBuffer + PREPOST_THRESHOLD, count, source, - DATA_TAG, g.VCComm ()); - } - else - { - return; - } -#else if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) { +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) + all_sends_are_finished = true; +#endif // Message exists, so recv and pack const Int count = mpi::GetCount < byte > (status); DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) @@ -165,7 +123,6 @@ namespace El recvVector_.resize (count); byte *recvBuffer = recvVector_.data (); mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); -#endif // Extract the header byte *head = recvBuffer; const Int i = *reinterpret_cast < const Int * >(head); @@ -225,13 +182,9 @@ namespace El for (Int s = 0; s < localHeight; ++s) YCol[s] += alpha * XCol[s]; } - -#ifdef EL_PREPOST_MESSAGES -#else // Free the memory for the recv buffer recvVector_.clear (); } -#endif } template < typename T > @@ -422,25 +375,12 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) LogicError ("Must detach before reattaching."); + const Grid & g = Z.Grid (); + if (type == LOCAL_TO_GLOBAL) { attachedForLocalToGlobal_ = true; localToGlobalMat_ = &Z; -//TODO Is this the right place to prepost? -#ifdef EL_PREPOST_MESSAGES -//TODO Make max-small-messages configurable, can we calculate an upper bound? 
- max_preposted_messages = Z.Grid ().Size () * 100; //function of the grid size - prepost_reqs = new mpi::Request[max_preposted_messages]; - - for (int i = 0; i < max_preposted_messages; i++) - { - //resize to default prepost size - recvVector_.resize (PREPOST_THRESHOLD); - recvBuffer = recvVector_.data (); - mpi::TaggedIRecv (recvBuffer, PREPOST_THRESHOLD, mpi::ANY_SOURCE, - mpi::ANY_TAG, g.VCComm (), &prepost_reqs[i]); - } -#endif } else { @@ -484,7 +424,9 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } - +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) + all_sends_are_finished = false; +#endif const Int p = X.Grid ().Size (); sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); @@ -605,30 +547,9 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo thisSendCol[s] = thisXCol[colShift + s * r]; } // Fire off the non-blocking send -#ifdef EL_PREPOST_MESSAGES - if (bufferSize <= PREPOST_THRESHOLD) - { - mpi::TaggedISSend - (sendBuffer, bufferSize, destination, SMALL_DATA_TAG, - g.VCComm (), dataSendRequests_[destination][index]); - } - else - { - //SMALL_DATA_TAG - mpi::TaggedISSend - (sendBuffer, PREPOST_THRESHOLD, destination, MORE_DATA_TAG, - g.VCComm (), dataSendRequests_[destination][index]); - //remaining data using MORE_DATA_TAG - mpi::TaggedISSend - (sendBuffer + PREPOST_THRESHOLD, - bufferSize - PREPOST_THRESHOLD, destination, DATA_TAG, - g.VCComm (), dataSendRequests_[destination][index]); - } -#else mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); -#endif } #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) all_sends_are_finished = true; @@ -801,49 +722,54 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - LogicError ("Must attach before detaching."); - -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) - DONE = false; - nb_bar_active = false; -#endif + LogicError ("Must attach before detaching."); + const Grid & g = (attachedForLocalToGlobal_ ? 
-                     localToGlobalMat_->Grid () : globalToLocalMat_->
-                     Grid ());
+                    localToGlobalMat_->Grid () : globalToLocalMat_->
+                    Grid ());
+
+    if (attachedForLocalToGlobal_)
+    {
 #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
-    while (!DONE)
+        bool DONE = false;
+        mpi::Request nb_bar_request;
+        bool nb_bar_active = false;
+        while (!DONE)
+        {
+            HandleLocalToGlobalData ();
+            if (nb_bar_active)
+            {
+                // test for IBarrier completion
+                DONE = mpi::Test (nb_bar_request);
+            }
+            else
+            {
+                if (all_sends_are_finished)
+                {
+                    // all ssends are complete, start nonblocking barrier
+                    mpi::IBarrier (g.VCComm (), nb_bar_request);
+                    nb_bar_active = true;
+                }
+            }
+        }
 #else
-    while (!Finished ())
-#endif
-    {
-        if (attachedForLocalToGlobal_)
-            HandleLocalToGlobalData ();
-        else
-            HandleGlobalToLocalRequest ();
-#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
-        if (nb_bar_active)
-        {
-            // test for IBarrier completion
-            DONE = mpi::Test (nb_bar_request);
-        }
-        else
-        {
-            if (all_sends_are_finished)
-            {
-                // all ssends are complete, start nonblocking barrier
-                mpi::IBarrier (g.VCComm (), nb_bar_request);
-                nb_bar_active = true;
-            }
-        }
-#else
-        HandleEoms ();
-#endif
-    }
-
-#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
-#else
-    mpi::Barrier (g.VCComm ());
+        while (!Finished ())
+        {
+            HandleLocalToGlobalData ();
+            HandleEoms ();
+        }
+        mpi::Barrier (g.VCComm ());
 #endif
+    }
+    else
+    {
+        while (!Finished ())
+        {
+            HandleGlobalToLocalRequest ();
+            HandleEoms ();
+        }
+        mpi::Barrier (g.VCComm ());
+    }
 
     attachedForLocalToGlobal_ = false;
     attachedForGlobalToLocal_ = false;
@@ -865,10 +791,6 @@ template < typename T > AxpyInterface < T >::AxpyInterface ():attachedForLocalTo
     replySendRequests_.clear ();
 
     eomSendRequests_.clear ();
-
-#ifdef EL_PREPOST_MESSAGES
-    delete[]prepost_reqs;
-#endif
 }
 
 template class AxpyInterface < Int >;
diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp
index 3b6b8938af..11a17ccf06 100644
--- a/src/core/RmaInterface.cpp
+++ b/src/core/RmaInterface.cpp
@@ -12,19 +12,31 @@ http://opensource.org/licenses/BSD-2-Clause
 */
 #include "El-lite.hpp"
+
+// Adapted from El's two-sided (AxpyInterface) implementation, with
+// point-to-point communication replaced by one-sided operations.
+// NOTE: this is an early skeleton whose primary goal is to compile.
+
+#if MPI_VERSION>=3
 namespace El {
+// const interfaces are not handled yet
 
 template<typename T>
 RmaInterface<T>::RmaInterface( DistMatrix<T>& Z )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface"))
+    const Int p = Z.Grid ().Size ();
 }
 
 template<typename T>
 RmaInterface<T>::RmaInterface( const DistMatrix<T>& X )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface"))
+    const Int p = X.Grid ().Size ();
 }
 
 template<typename T>
@@ -54,6 +66,7 @@ template
 void RmaInterface<T>::Attach( DistMatrix<T>& Z )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach"))
+
 }
 
 template<typename T>
@@ -63,13 +76,81 @@ void RmaInterface::Attach( const DistMatrix& X )
 }
 
 template<typename T>
-void RmaInterface<T>::Put( Matrix<T>& Z, Int i, Int j )
+void RmaInterface<T>::Put( T alpha, Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put"))
+
+    DistMatrix<T>& Y = *localToGlobalMat_;
+    if( i < 0 || j < 0 )
+        LogicError("Submatrix offsets must be non-negative");
+    if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() )
+        LogicError("Submatrix out of bounds of global matrix");
+
+    // TODO: RMA-related checks
+
+    const Grid& g = Y.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int p = g.Size();
+    const Int myProcessRow = g.Row();
+    const Int myProcessCol = g.Col();
+    const Int colAlign = (Y.ColAlign() + i) % r;
+    const Int rowAlign = (Y.RowAlign() + j) % c;
+
+    // local width and height
+    const Int height = Z.Height();
+    const Int width = Z.Width();
+
+    // put local matrix cells in
+    // correct places in global array
+    Int receivingRow = myProcessRow;
+    Int receivingCol = myProcessCol;
+    for( Int step=0; step<p; ++step )
+    {
+        const Int colShift = Shift( receivingRow, colAlign, r );
+        const Int rowShift = Shift( receivingCol, rowAlign, c );
+        const Int localHeight = Length( height, colShift, r );
+        const Int localWidth = Length( width, rowShift, c );
+        const Int numEntries = localHeight*localWidth;
+        if( numEntries != 0 )
+        {
+            const Int destination = receivingRow + r*receivingCol;
+            const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T);
+            putVector_.resize( bufferSize );
+            byte* sendBuffer = putVector_.data();
+
+            // Pack the header
+            byte* head = sendBuffer;
+            *reinterpret_cast<Int*>(head) = i; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = j; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = height; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = width; head += sizeof(Int);
+            *reinterpret_cast<T*>(head) = alpha; head += sizeof(T);
+
+            // Pack the payload from the local matrix Z
+            // consider ddt here
+            T* sendData = reinterpret_cast<T*>(head);
+            const T* XBuffer = Z.LockedBuffer();
+            const Int XLDim = Z.LDim();
+            for( Int t=0; t<localWidth; ++t )
+            {
+                T* thisSendCol = &sendData[t*localHeight];
+                const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim];
+                for( Int s=0; s<localHeight; ++s )
+                    thisSendCol[s] = thisXCol[colShift+s*r];
+            }
+            mpi::Iput( sendBuffer, bufferSize, destination, bufferSize,
+                       window );
+        }
+        receivingRow = (receivingRow + 1) % r;
+        if( receivingRow == 0 )
+            receivingCol = (receivingCol + 1) % c;
+    }
 }
 
 template<typename T>
-void RmaInterface<T>::Put( const Matrix<T>& Z, Int i, Int j )
+void RmaInterface<T>::Put( T alpha, const Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put"))
 }
@@ -78,6 +159,62 @@ template
 void RmaInterface<T>::Get( Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get"))
+    const DistMatrix<T>& X = *globalToLocalMat_;
+
+    const Int height = Z.Height ();
+    const Int width = Z.Width ();
+    if (i + height > X.Height () || j + width > X.Width ())
+        LogicError ("Invalid AxpyGlobalToLocal submatrix");
+
+    const Grid& g = X.Grid ();
+    const Int r = g.Height ();
+    const Int c = g.Width ();
+    const Int p = g.Size ();
+    const Int myRow = g.Row ();
+    const Int myCol = g.Col ();
+
+    for (Int rank = 0; rank < p; ++rank)
+    {
+        // NOTE: this receive path is known to be incorrect;
+        // it only exists so that the file compiles
+        const Int buffersize = height * width * sizeof(T);
+        getVector_.resize (buffersize);
+        byte *getBuffer = getVector_.data ();
+        // TODO: the remote data size is not actually known here
+        mpi::Iget (getBuffer, buffersize, rank, buffersize, window);
+        // Extract the header
+        byte *head = getBuffer;
+        const Int i = *reinterpret_cast<const Int*>(head);
+        head += sizeof (Int);
+        const Int j = *reinterpret_cast<const Int*>(head);
+        head += sizeof (Int);
+        const Int height = *reinterpret_cast<const Int*>(head);
+        head += sizeof (Int);
+        const Int width = *reinterpret_cast<const Int*>(head);
+        head += sizeof (Int);
+        const T alpha = *reinterpret_cast<const T*>(head);
+        head += sizeof (T);
+
+        // Update Y
+        const T *XBuffer = reinterpret_cast<const T*>(head);
+        const Int colAlign = (X.ColAlign () + i) % r;
+        const Int rowAlign = (X.RowAlign () + j) % c;
+        const Int colShift = Shift (myRow, colAlign, r);
+        const Int rowShift = Shift (myCol, rowAlign, c);
+
+        const Int localHeight = Length (height, colShift, r);
+        const Int localWidth = Length (width, rowShift, c);
+        const Int iLocalOffset = Length (i, X.ColShift (), r);
+        const Int jLocalOffset = Length (j, X.RowShift (), c);
+
+        for (Int t = 0; t < localWidth; ++t)
+        {
+            T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t);
+            const T *XCol = &XBuffer[t * localHeight];
+            for (Int s = 0; s < localHeight; ++s)
+                YCol[s] += alpha * XCol[s];
+        }
+    }
 }
 
 template<typename T>
@@ -86,18 +223,93 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get"))
 }
 
+// scaled accumulate
 template<typename T>
-void RmaInterface<T>::Acc( Matrix<T>& Z, Int i, Int j )
+void RmaInterface<T>::Acc( T alpha, Matrix<T>& Z, mpi::Op &op, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc"))
+
+    DistMatrix<T>& Y = *localToGlobalMat_;
+    if( i < 0 || j < 0 )
+        LogicError("Submatrix offsets must be non-negative");
+    if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() )
+        LogicError("Submatrix out of bounds of global matrix");
+
+    // TODO: RMA-related checks
+
+    const Grid& g = Y.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int p = g.Size();
+    const Int myProcessRow = g.Row();
+    const Int myProcessCol = g.Col();
+    const Int colAlign = (Y.ColAlign() + i) % r;
+    const Int rowAlign = (Y.RowAlign() + j) % c;
+
+    // local width and height
+    const Int height = Z.Height();
+    const Int width = Z.Width();
+
+    // put local matrix cells in
+    // correct places in global array
+    Int receivingRow = myProcessRow;
+    Int receivingCol = myProcessCol;
+    for( Int step=0; step<p; ++step )
+    {
+        const Int colShift = Shift( receivingRow, colAlign, r );
+        const Int rowShift = Shift( receivingCol, rowAlign, c );
+        const Int localHeight = Length( height, colShift, r );
+        const Int localWidth = Length( width, rowShift, c );
+        const Int numEntries = localHeight*localWidth;
+        if( numEntries != 0 )
+        {
+            const Int destination = receivingRow + r*receivingCol;
+            const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T);
+            putVector_.resize( bufferSize );
+            byte* sendBuffer = putVector_.data();
+
+            // Pack the header
+            byte* head = sendBuffer;
+            *reinterpret_cast<Int*>(head) = i; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = j; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = height; head += sizeof(Int);
+            *reinterpret_cast<Int*>(head) = width; head += sizeof(Int);
+            *reinterpret_cast<T*>(head) = alpha; head += sizeof(T);
+
+            // Pack the payload
+            // consider ddt here
+            T* sendData = reinterpret_cast<T*>(head);
+            const T* XBuffer = Z.LockedBuffer();
+            const Int XLDim = Z.LDim();
+            for( Int t=0; t<localWidth; ++t )
+            {
+                T* thisSendCol = &sendData[t*localHeight];
+                const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim];
+                for( Int s=0; s<localHeight; ++s )
+                    thisSendCol[s] = thisXCol[colShift+s*r];
+            }
+            mpi::Iacc( sendBuffer, bufferSize, destination, bufferSize,
+                       op, window );
+        }
+        receivingRow = (receivingRow + 1) % r;
+        if( receivingRow == 0 )
+            receivingCol = (receivingCol + 1) % c;
+    }
+}
 
 template<typename T>
-void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
+void RmaInterface<T>::Acc( T alpha, const Matrix<T>& Z, mpi::Op &op, Int i, Int j )
 {
     DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc"))
 }
 
+template<typename T>
+void RmaInterface<T>::Flush( const Matrix<T>& Z, Int i, Int j )
+{
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush"))
+}
+
 template<typename T>
 void RmaInterface<T>::Detach()
 {
@@ -111,3 +323,4 @@
 template class RmaInterface<Complex<float>>;
 template class RmaInterface<Complex<double>>;
 } // namespace El
+#endif
diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp
index ae6964e22e..7474570bd8 100644
--- a/src/core/imports/mpi.cpp
+++ b/src/core/imports/mpi.cpp
@@ -1,2849 +1,4284 @@
 /*
    Copyright (c) 2009-2014, Jack Poulson
-   2013, Jeff Hammond
-   2013, Jed Brown
+   2013, Jeff Hammond
+   2013, Jed Brown
+   2014, Sayan Ghosh
    All rights reserved.
- This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at - http://opensource.org/licenses/BSD-2-Clause + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" -typedef unsigned char* UCP; +typedef unsigned char *UCP; -namespace { +namespace +{ -inline void -SafeMpi( int mpiError ) +inline void SafeMpi (int mpiError) { - DEBUG_ONLY( - if( mpiError != MPI_SUCCESS ) - { - char errorString[200]; - int lengthOfErrorString; - MPI_Error_string( mpiError, errorString, &lengthOfErrorString ); - El::RuntimeError( std::string(errorString) ); - } - ) + DEBUG_ONLY (if (mpiError != MPI_SUCCESS) +{ + char errorString[200]; + int lengthOfErrorString; + MPI_Error_string (mpiError, errorString, + &lengthOfErrorString); + El::RuntimeError (std::string (errorString)); + }) } +} // anonymous namespace -} // anonymous namespace - -namespace El { -namespace mpi { +namespace El +{ +namespace mpi +{ // MPI environmental routines // ========================== -void Initialize( int& argc, char**& argv ) -{ MPI_Init( &argc, &argv ); } +void Initialize (int &argc, char **&argv) +{ + MPI_Init (&argc, &argv); +} -int InitializeThread( int& argc, char**& argv, int required ) -{ - int provided; +int InitializeThread (int &argc, char **&argv, + int required) +{ + int provided; #ifdef EL_HAVE_MPI_INIT_THREAD - MPI_Init_thread( &argc, &argv, required, &provided ); + MPI_Init_thread (&argc, &argv, required, &provided); #else - MPI_Init( &argc, &argv ); - provided = 0; // equivalent to MPI_THREAD_SINGLE + MPI_Init (&argc, &argv); + provided = 0; // equivalent to MPI_THREAD_SINGLE #endif return provided; } -void Finalize() -{ MPI_Finalize(); } +void Finalize () +{ + MPI_Finalize (); +} -bool Initialized() -{ +bool Initialized () +{ int initialized; - MPI_Initialized( &initialized ); + + MPI_Initialized (&initialized); return initialized; } -bool Finalized() +bool Finalized () { int finalized; - MPI_Finalized( &finalized ); + + MPI_Finalized (&finalized); return finalized; } -int QueryThread() +int QueryThread () { int provided; + #ifdef EL_HAVE_MPI_QUERY_THREAD - MPI_Query_thread( &provided ); + MPI_Query_thread (&provided); #else - provided = 0; // equivalent to MPI_THREAD_SINGLE + provided = 0; // equivalent to MPI_THREAD_SINGLE #endif return provided; } -void Abort( Comm comm, int errCode ) -{ MPI_Abort( comm.comm, errCode ); } +void Abort (Comm comm, int errCode) +{ + MPI_Abort (comm.comm, errCode); +} -double Time() -{ return MPI_Wtime(); } +double Time () +{ + return MPI_Wtime (); +} -void Create( UserFunction* func, bool commutes, Op& op ) +void Create (UserFunction * func, bool commutes, Op & op) { - DEBUG_ONLY(CallStackEntry cse("mpi::Create")) - SafeMpi( MPI_Op_create( func, commutes, &op.op ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Create")) + SafeMpi (MPI_Op_create (func, commutes, &op.op)); } -void Free( Op& op ) +void Free (Op & op) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Op_free( &op.op ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Op_free (&op.op)); } -// Communicator manipulation +// Communicator manipulation // ========================= -int WorldRank() +int WorldRank () { - DEBUG_ONLY(CallStackEntry cse("mpi::WorldRank")) - return Rank( mpi::COMM_WORLD ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WorldRank")) + return Rank 
(mpi::COMM_WORLD); } -int Rank( Comm comm ) +int Rank (Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Rank")) - if( comm != COMM_NULL ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Rank")) + if (comm != COMM_NULL) { int rank; - SafeMpi( MPI_Comm_rank( comm.comm, &rank ) ); + + SafeMpi (MPI_Comm_rank (comm.comm, &rank)); return rank; } - else return mpi::UNDEFINED; + else + return mpi::UNDEFINED; } -int Size( Comm comm ) +int Size (Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Size")) - if( comm != COMM_NULL ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Size")) + if (comm != COMM_NULL) { int size; - SafeMpi( MPI_Comm_size( comm.comm, &size ) ); + + SafeMpi (MPI_Comm_size (comm.comm, &size)); return size; - } - else return mpi::UNDEFINED; + } + else + return mpi::UNDEFINED; } -void Create( Comm parentComm, Group subsetGroup, Comm& subsetComm ) +void Create (Comm parentComm, Group subsetGroup, + Comm & subsetComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Create")) - SafeMpi( - MPI_Comm_create( parentComm.comm, subsetGroup.group, &subsetComm.comm ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Create")) + SafeMpi (MPI_Comm_create + (parentComm.comm, subsetGroup.group, + &subsetComm.comm)); } -void Dup( Comm original, Comm& duplicate ) +void Dup (Comm original, Comm & duplicate) { - DEBUG_ONLY(CallStackEntry cse("mpi::Dup")) - SafeMpi( MPI_Comm_dup( original.comm, &duplicate.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Dup")) + SafeMpi (MPI_Comm_dup + (original.comm, &duplicate.comm)); } -void Split( Comm comm, int color, int key, Comm& newComm ) +void Split (Comm comm, int color, int key, Comm & newComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Split")) - SafeMpi( MPI_Comm_split( comm.comm, color, key, &newComm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Split")) + SafeMpi (MPI_Comm_split + (comm.comm, color, key, &newComm.comm)); } -void Free( Comm& comm ) +void Free (Comm & comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Comm_free( &comm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Comm_free (&comm.comm)); } -bool Congruent( Comm comm1, Comm comm2 ) +bool Congruent (Comm comm1, Comm comm2) { - DEBUG_ONLY(CallStackEntry cse("mpi::Congruent")) - int result; - SafeMpi( MPI_Comm_compare( comm1.comm, comm2.comm, &result ) ); - return ( result == MPI_IDENT || result == MPI_CONGRUENT ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Congruent")) int + result; + SafeMpi (MPI_Comm_compare + (comm1.comm, comm2.comm, &result)); + return (result == MPI_IDENT + || result == MPI_CONGRUENT); } -void ErrorHandlerSet( Comm comm, ErrorHandler errorHandler ) +void ErrorHandlerSet (Comm comm, + ErrorHandler errorHandler) { - DEBUG_ONLY(CallStackEntry cse("mpi::ErrorHandlerSet")) + DEBUG_ONLY (CallStackEntry + cse ("mpi::ErrorHandlerSet")) #ifdef EL_HAVE_MPI_COMM_SET_ERRHANDLER - SafeMpi( MPI_Comm_set_errhandler( comm.comm, errorHandler ) ); + SafeMpi (MPI_Comm_set_errhandler + (comm.comm, errorHandler)); #else - SafeMpi( MPI_Errhandler_set( comm.comm, errorHandler ) ); + SafeMpi (MPI_Errhandler_set + (comm.comm, errorHandler)); #endif } -// Cartesian communicator routines +// Cartesian communicator routines // =============================== void CartCreate -( Comm comm, int numDims, const int* dimensions, const int* periods, - bool reorder, Comm& cartComm ) +(Comm comm, int numDims, const int *dimensions, + const int *periods, bool reorder, Comm & cartComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::CartCreate")) + DEBUG_ONLY (CallStackEntry cse 
("mpi::CartCreate")) SafeMpi - ( MPI_Cart_create - ( comm.comm, numDims, const_cast(dimensions), - const_cast(periods), reorder, &cartComm.comm ) ); + (MPI_Cart_create + (comm.comm, numDims, + const_cast < int *>(dimensions), + const_cast < int *>(periods), reorder, + &cartComm.comm)); } -void CartSub( Comm comm, const int* remainingDims, Comm& subComm ) +void CartSub (Comm comm, const int *remainingDims, + Comm & subComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::CartSub")) - SafeMpi( - MPI_Cart_sub - ( comm.comm, const_cast(remainingDims), &subComm.comm ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::CartSub")) + SafeMpi (MPI_Cart_sub + (comm.comm, + const_cast < int *>(remainingDims), + &subComm.comm)); } -// Group manipulation +// Group manipulation // ================== -int Rank( Group group ) +int Rank (Group group) { - DEBUG_ONLY(CallStackEntry cse("mpi::Rank")) - int rank; - SafeMpi( MPI_Group_rank( group.group, &rank ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Rank")) int + rank; + SafeMpi (MPI_Group_rank (group.group, &rank)); return rank; } -int Size( Group group ) +int Size (Group group) { - DEBUG_ONLY(CallStackEntry cse("mpi::Size")) - int size; - SafeMpi( MPI_Group_size( group.group, &size ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Size")) int + size; + SafeMpi (MPI_Group_size (group.group, &size)); return size; } -void CommGroup( Comm comm, Group& group ) +void CommGroup (Comm comm, Group & group) { - DEBUG_ONLY(CallStackEntry cse("mpi::CommGroup")) - SafeMpi( MPI_Comm_group( comm.comm, &group.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::CommGroup")) + SafeMpi (MPI_Comm_group + (comm.comm, &group.group)); } -void Dup( Group group, Group& newGroup ) +void Dup (Group group, Group & newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Dup")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Dup")) // For some reason, MPI_Group_dup does not exist - Excl( group, 0, 0, newGroup ); + Excl (group, 0, 0, newGroup); } -void Union( Group groupA, Group groupB, Group& newGroup ) +void Union (Group groupA, Group groupB, Group & newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Union")) - SafeMpi( MPI_Group_union( groupA.group, groupB.group, &newGroup.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Union")) + SafeMpi (MPI_Group_union + (groupA.group, groupB.group, + &newGroup.group)); } -void Incl( Group group, int n, const int* ranks, Group& subGroup ) +void Incl (Group group, int n, const int *ranks, + Group & subGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Incl")) - SafeMpi( - MPI_Group_incl - ( group.group, n, const_cast(ranks), &subGroup.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Incl")) + SafeMpi (MPI_Group_incl + (group.group, n, + const_cast < int *>(ranks), + &subGroup.group)); } -void Excl( Group group, int n, const int* ranks, Group& subGroup ) +void Excl (Group group, int n, const int *ranks, + Group & subGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Excl")) - SafeMpi( - MPI_Group_excl - ( group.group, n, const_cast(ranks), &subGroup.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Excl")) + SafeMpi (MPI_Group_excl + (group.group, n, + const_cast < int *>(ranks), + &subGroup.group)); } -void Difference( Group parent, Group subset, Group& complement ) +void Difference (Group parent, Group subset, + Group & complement) { - DEBUG_ONLY(CallStackEntry cse("mpi::Difference")) - SafeMpi( - MPI_Group_difference( parent.group, subset.group, &complement.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Difference")) + SafeMpi (MPI_Group_difference + (parent.group, 
subset.group, + &complement.group)); } -void Free( Group& group ) +void Free (Group & group) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Group_free( &group.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Group_free (&group.group)); } // Rank translations // ================= -int Translate( Group origGroup, int origRank, Group newGroup ) +int Translate (Group origGroup, int origRank, + Group newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origGroup, 1, &origRank, newGroup, &newRank ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origGroup, 1, &origRank, newGroup, + &newRank); return newRank; } -int Translate( Comm origComm, int origRank, Group newGroup ) +int Translate (Comm origComm, int origRank, + Group newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origComm, 1, &origRank, newGroup, &newRank ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origComm, 1, &origRank, newGroup, + &newRank); return newRank; } -int Translate( Group origGroup, int origRank, Comm newComm ) +int Translate (Group origGroup, int origRank, + Comm newComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origGroup, 1, &origRank, newComm, &newRank ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origGroup, 1, &origRank, newComm, + &newRank); return newRank; } -int Translate( Comm origComm, int origRank, Comm newComm ) +int Translate (Comm origComm, int origRank, Comm newComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origComm, 1, &origRank, newComm, &newRank ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origComm, 1, &origRank, newComm, &newRank); return newRank; } void Translate -( Group origGroup, int size, const int* origRanks, - Group newGroup, int* newRanks ) +(Group origGroup, int size, const int *origRanks, + Group newGroup, int *newRanks) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) SafeMpi - ( MPI_Group_translate_ranks - ( origGroup.group, size, const_cast(origRanks), - newGroup.group, newRanks ) ); + (MPI_Group_translate_ranks + (origGroup.group, size, + const_cast < int *>(origRanks), + newGroup.group, newRanks)); } void Translate -( Comm origComm, int size, const int* origRanks, - Group newGroup, int* newRanks ) +(Comm origComm, int size, const int *origRanks, + Group newGroup, int *newRanks) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) Group origGroup; - CommGroup( origComm, origGroup ); - Translate( origGroup, size, origRanks, newGroup, newRanks ); - Free( origGroup ); + + CommGroup (origComm, origGroup); + Translate (origGroup, size, origRanks, newGroup, + newRanks); + Free (origGroup); } void Translate -( Group origGroup, int size, const int* origRanks, - Comm newComm, int* newRanks ) +(Group origGroup, int size, const int *origRanks, + Comm newComm, int *newRanks) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) Group newGroup; - CommGroup( newComm, newGroup ); - Translate( origGroup, size, origRanks, newGroup, newRanks ); - Free( newGroup ); + + CommGroup (newComm, newGroup); + Translate (origGroup, size, origRanks, newGroup, + newRanks); + Free (newGroup); } void Translate -( Comm 
origComm, int size, const int* origRanks,
- Comm newComm, int* newRanks )
+(Comm origComm, int size, const int *origRanks,
+ Comm newComm, int *newRanks)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Translate"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Translate"))
     Group origGroup, newGroup;
-    CommGroup( origComm, origGroup );
-    CommGroup( newComm, newGroup );
-    Translate( origGroup, size, origRanks, newGroup, newRanks );
-    Free( origGroup );
-    Free( newGroup );
+
+    CommGroup (origComm, origGroup);
+    CommGroup (newComm, newGroup);
+    Translate (origGroup, size, origRanks, newGroup, newRanks);
+    Free (origGroup);
+    Free (newGroup);
+}
+
+// MPI-3 RMA functions
+// ==================
+
+#if MPI_VERSION>=3
+void SetWindowProp (Window & window, int prop)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::SetWindowProp"))
+    Info info;
+
+    SafeMpi (MPI_Info_create (&info));
+
+    if (prop & (1 << 0)) // strict
+        SafeMpi (MPI_Info_set
+                 (info, "accumulate_ordering", "rar,raw,war,waw"));
+
+    if (prop & (1 << 1)) // partial
+        SafeMpi (MPI_Info_set
+                 (info, "accumulate_ordering", "rar,waw"));
+
+    if (prop & (1 << 2)) // none
+        SafeMpi (MPI_Info_set
+                 (info, "accumulate_ops", "same_op_no_op"));
+
+    SafeMpi (MPI_Win_set_info (window, info));
+}
+
+//NOTE assuming MPI_MODE_NOCHECK
+void WindowLock (int rank, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock"))
+    SafeMpi (MPI_Win_lock (MPI_LOCK_SHARED, rank, MPI_MODE_NOCHECK, window));
+}
+
+void WindowLock (Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock"))
+    SafeMpi (MPI_Win_lock_all (MPI_MODE_NOCHECK, window));
+}
+
+void WindowUnlock (int rank, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock"))
+    SafeMpi (MPI_Win_unlock (rank, window));
+}
+
+void WindowUnlock (Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock"))
+    SafeMpi (MPI_Win_unlock_all (window));
+}
+
+void WindowCreate (int size, Comm comm, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate"))
+    // NOTE: the locally allocated base pointer is currently discarded
+    void *baseptr;
+
+    SafeMpi (MPI_Win_allocate
+             ((MPI_Aint) size, 1, MPI_INFO_NULL, comm.comm,
+              &baseptr, &window));
+#ifdef EL_NO_ACC_ORDERING
+    SetWindowProp (window, NO_ACC_ORDERING);
+#endif
+    SafeMpi (MPI_Barrier (comm.comm));
+}
+
+void WindowCreate (int size, Info info, Comm comm, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate"))
+    void *baseptr;
+
+    SafeMpi (MPI_Win_allocate
+             ((MPI_Aint) size, 1, info, comm.comm, &baseptr, &window));
+    SafeMpi (MPI_Barrier (comm.comm));
+}
+
+void Iput (void *source, int source_size, int target_rank,
+           int target_size, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Iput"))
+#ifdef EL_ENSURE_PUT_ATOMICITY
+    SafeMpi (MPI_Accumulate
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, MPI_REPLACE, window));
+#else
+    SafeMpi (MPI_Put
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, window));
+#endif
+}
+
+void Rput (void *source, int source_size, int target_rank,
+           int target_size, Window & window, Request & request)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Rput"))
+#ifdef EL_ENSURE_PUT_ATOMICITY
+    SafeMpi (MPI_Raccumulate
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, MPI_REPLACE,
+              window, &request));
+#else
+    SafeMpi (MPI_Rput
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, window, &request));
+#endif
+}
+
+void Iget (void *source, int source_size, int target_rank,
+           int target_size, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Iget"))
+#ifdef EL_ENSURE_GET_ATOMICITY
+    SafeMpi (MPI_Get_accumulate
+             (NULL, 0, MPI_BYTE, source, (MPI_Aint) source_size, MPI_BYTE,
+              target_rank, 1, (MPI_Aint) target_size, MPI_BYTE,
+              MPI_NO_OP, window));
+#else
+    SafeMpi (MPI_Get
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, window));
+#endif
+}
+
+void Rget (void *source, int source_size, int target_rank,
+           int target_size, Window & window, Request & request)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Rget"))
+#ifdef EL_ENSURE_GET_ATOMICITY
+    SafeMpi (MPI_Rget_accumulate
+             (NULL, 0, MPI_BYTE, source, (MPI_Aint) source_size, MPI_BYTE,
+              target_rank, 1, (MPI_Aint) target_size, MPI_BYTE,
+              MPI_NO_OP, window, &request));
+#else
+    SafeMpi (MPI_Rget
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, window, &request));
+#endif
+}
+
+void Iacc (void *source, int source_size, int target_rank,
+           int target_size, Op & op, Window & window)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate"))
+    SafeMpi (MPI_Accumulate
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, op.op, window));
+}
+
+void Racc (void *source, int source_size, int target_rank,
+           int target_size, Op & op, Window & window, Request & request)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate"))
+    SafeMpi (MPI_Raccumulate
+             (source, (MPI_Aint) source_size, MPI_BYTE, target_rank,
+              1, (MPI_Aint) target_size, MPI_BYTE, op.op,
+              window, &request));
+}
+
+void Flush (int target_rank, Window & window, bool isLocalCompletion)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Flush"))
+    if (isLocalCompletion)
+        SafeMpi (MPI_Win_flush_local (target_rank, window));
+    else
+        SafeMpi (MPI_Win_flush (target_rank, window));
+}
+
+void Flush (Window & window, bool isLocalCompletion)
+{
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Flush"))
+    if (isLocalCompletion)
+        SafeMpi (MPI_Win_flush_local_all (window));
+    else
+        SafeMpi (MPI_Win_flush_all (window));
+}
+#endif
+
 // Various utilities
 // =================
 
 // Wait until every process in comm reaches this statement
-void Barrier( Comm comm )
+void Barrier (Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Barrier"))
-    SafeMpi( MPI_Barrier( comm.comm ) );
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Barrier"))
+    SafeMpi (MPI_Barrier (comm.comm));
 }
 
 #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER)
-void IBarrier( Comm comm, Request& request )
+void IBarrier (Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IBarrier"))
-    SafeMpi( MPI_Ibarrier( comm.comm, &request ) );
+    DEBUG_ONLY (CallStackEntry cse ("mpi::IBarrier"))
+    SafeMpi (MPI_Ibarrier (comm.comm, &request));
 }
 #endif
+
 // Test for completion
-bool Test( Request& request )
+bool Test (Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Test"))
-    Status status;
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Test"))
+    Status status;
     int flag;
-    SafeMpi( MPI_Test( &request, &flag, &status ) );
+
+    SafeMpi (MPI_Test (&request, &flag, &status));
     return flag;
 }
 
-bool Test( Request& request, Status& status )
+bool Test (Request & request, Status & status)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Test"))
-    int flag;
-    SafeMpi( MPI_Test( &request, &flag, &status ) );
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Test"))
+    int flag;
+
+    SafeMpi (MPI_Test (&request, &flag, &status));
    return flag;
 }
 
-//
Ensure that the request finishes before continuing -void Wait( Request& request ) +bool Testany (int count, Request * requests, int &indx, + Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int + flag; + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + return flag; +} + +bool Testany (int count, Request * requests, int &indx) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int + flag; + Status status; + + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + return flag; +} + +bool Testany (int count, Request * requests) { - DEBUG_ONLY(CallStackEntry cse("mpi::Wait")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int + flag, indx; Status status; - SafeMpi( MPI_Wait( &request, &status ) ); + + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + return flag; +} + +// Ensure that the request finishes before continuing +void Wait (Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) Status + status; + SafeMpi (MPI_Wait (&request, &status)); } // Ensure that the request finishes before continuing -void Wait( Request& request, Status& status ) +void Wait (Request & request, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Wait")) - SafeMpi( MPI_Wait( &request, &status ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) + SafeMpi (MPI_Wait (&request, &status)); } // Ensure that several requests finish before continuing -void WaitAll( int numRequests, Request* requests ) +void WaitAll (int numRequests, Request * requests) { - DEBUG_ONLY(CallStackEntry cse("mpi::WaitAll")) - std::vector statuses( numRequests ); - SafeMpi( MPI_Waitall( numRequests, requests, statuses.data() ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WaitAll")) + std::vector < Status > statuses (numRequests); + SafeMpi (MPI_Waitall + (numRequests, requests, statuses.data ())); } // Ensure that several requests finish before continuing -void WaitAll( int numRequests, Request* requests, Status* statuses ) +void WaitAll (int numRequests, Request * requests, + Status * statuses) { - DEBUG_ONLY(CallStackEntry cse("mpi::WaitAll")) - SafeMpi( MPI_Waitall( numRequests, requests, statuses ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WaitAll")) + SafeMpi (MPI_Waitall + (numRequests, requests, statuses)); } // Nonblocking test for message completion -bool IProbe( int source, int tag, Comm comm, Status& status ) +bool IProbe (int source, int tag, Comm comm, + Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::IProbe")) - int flag; - SafeMpi( MPI_Iprobe( source, tag, comm.comm, &flag, &status ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::IProbe")) int + flag; + SafeMpi (MPI_Iprobe + (source, tag, comm.comm, &flag, &status)); return flag; } -bool IProbe( int source, Comm comm, Status& status ) -{ return IProbe( source, 0, comm, status ); } +bool IProbe (int source, Comm comm, Status & status) +{ + return IProbe (source, 0, comm, status); +} -template -int GetCount( Status& status ) +template < typename T > int GetCount (Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::GetCount")) - int count; - SafeMpi( MPI_Get_count( &status, TypeMap(), &count ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::GetCount")) int + count; + SafeMpi (MPI_Get_count + (&status, TypeMap < T > (), &count)); return count; } -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); +template int GetCount < 
byte > (Status & status); +template int GetCount < int >(Status & status); +template int GetCount < unsigned >(Status & status); +template int GetCount < long int >(Status & status); +template int GetCount < unsigned long >(Status & status); + #ifdef EL_HAVE_MPI_LONG_LONG -template int GetCount( Status& status ); -template int GetCount( Status& status ); +template int GetCount < long long int >(Status & status); +template int GetCount < +unsigned long long >(Status & status); #endif -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount>( Status& status ); -template int GetCount>( Status& status ); - -template -void TaggedSend( const R* buf, int count, int to, int tag, Comm comm ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::Send")) - SafeMpi( - MPI_Send( const_cast(buf), count, TypeMap(), to, tag, comm.comm ) - ); +template int GetCount < float >(Status & status); +template int GetCount < double >(Status & status); +template int GetCount < Complex < +float >>(Status & status); +template int GetCount < Complex < +double >>(Status & status); + +template < typename R > +void TaggedSend (const R * buf, int count, int to, + int tag, Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Send")) + SafeMpi (MPI_Send + (const_cast < R * >(buf), count, + TypeMap < R > (), to, tag, comm.comm)); } -template -void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ) +template < typename R > +void TaggedSend (const Complex < R > *buf, int count, + int to, int tag, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Send")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Send")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Send - ( const_cast*>(buf), 2*count, TypeMap(), to, - tag, comm.comm ) ); + (MPI_Send + (const_cast < Complex < R > *>(buf), 2 * count, + TypeMap < R > (), to, tag, comm.comm)); #else SafeMpi - ( MPI_Send - ( const_cast*>(buf), count, - TypeMap>(), to, tag, comm.comm ) ); + (MPI_Send + (const_cast < Complex < R > *>(buf), count, + TypeMap < Complex < R >> (), to, tag, + comm.comm)); #endif } -template void TaggedSend( const byte* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const long int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned long* buf, int count, int to, int tag, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedSend( const long long int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned long long* buf, int count, int to, int tag, Comm comm ); -#endif -template void TaggedSend( const float* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const double* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ); - -template -void Send( const T* buf, int count, int to, Comm comm ) -{ TaggedSend( buf, count, to, 0, comm ); } - -template void Send( const byte* buf, int count, int to, Comm comm ); -template void Send( const int* buf, int count, int to, Comm comm ); -template void Send( const unsigned* buf, int count, int to, Comm comm ); -template void Send( const long int* buf, int count, int to, Comm comm ); -template void Send( const unsigned long* 
buf, int count, int to, Comm comm ); +template void TaggedSend (const byte * buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const int *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const unsigned *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const long int *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const unsigned long *buf, + int count, int to, int tag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Send( const long long int* buf, int count, int to, Comm comm ); -template void Send( const unsigned long long* buf, int count, int to, Comm comm ); -#endif -template void Send( const float* buf, int count, int to, Comm comm ); -template void Send( const double* buf, int count, int to, Comm comm ); -template void Send( const Complex* buf, int count, int to, Comm comm ); -template void Send( const Complex* buf, int count, int to, Comm comm ); - -template -void TaggedSend( T b, int to, int tag, Comm comm ) -{ TaggedSend( &b, 1, to, tag, comm ); } - -template void TaggedSend( byte b, int to, int tag, Comm comm ); -template void TaggedSend( int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned b, int to, int tag, Comm comm ); -template void TaggedSend( long int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned long b, int to, int tag, Comm comm ); +template void TaggedSend (const long long int *buf, + int count, int to, int tag, + Comm comm); +template void TaggedSend (const unsigned long long *buf, + int count, int to, int tag, + Comm comm); +#endif +template void TaggedSend (const float *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const double *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const Complex < float >*buf, + int count, int to, int tag, + Comm comm); +template void TaggedSend (const Complex < double >*buf, + int count, int to, int tag, + Comm comm); + +template < typename T > +void Send (const T * buf, int count, int to, + Comm comm) +{ + TaggedSend (buf, count, to, 0, comm); +} + +template void Send (const byte * buf, int count, int to, + Comm comm); +template void Send (const int *buf, int count, int to, + Comm comm); +template void Send (const unsigned *buf, int count, + int to, Comm comm); +template void Send (const long int *buf, int count, + int to, Comm comm); +template void Send (const unsigned long *buf, int count, + int to, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedSend( long long int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned long long b, int to, int tag, Comm comm ); -#endif -template void TaggedSend( float b, int to, int tag, Comm comm ); -template void TaggedSend( double b, int to, int tag, Comm comm ); -template void TaggedSend( Complex b, int to, int tag, Comm comm ); -template void TaggedSend( Complex b, int to, int tag, Comm comm ); - -template -void Send( T b, int to, Comm comm ) -{ TaggedSend( b, to, 0, comm ); } - -template void Send( byte b, int to, Comm comm ); -template void Send( int b, int to, Comm comm ); -template void Send( unsigned b, int to, Comm comm ); -template void Send( long int b, int to, Comm comm ); -template void Send( unsigned long b, int to, Comm comm ); +template void Send (const long long int *buf, int count, + int to, Comm comm); +template void Send (const unsigned long long *buf, + int count, int to, Comm comm); +#endif +template void Send (const float *buf, int count, int to, 
+ Comm comm); +template void Send (const double *buf, int count, int to, + Comm comm); +template void Send (const Complex < float >*buf, + int count, int to, Comm comm); +template void Send (const Complex < double >*buf, + int count, int to, Comm comm); + +template < typename T > +void TaggedSend (T b, int to, int tag, Comm comm) +{ + TaggedSend (&b, 1, to, tag, comm); +} + +template void TaggedSend (byte b, int to, int tag, + Comm comm); +template void TaggedSend (int b, int to, int tag, + Comm comm); +template void TaggedSend (unsigned b, int to, int tag, + Comm comm); +template void TaggedSend (long int b, int to, int tag, + Comm comm); +template void TaggedSend (unsigned long b, int to, + int tag, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Send( long long int b, int to, Comm comm ); -template void Send( unsigned long long b, int to, Comm comm ); +template void TaggedSend (long long int b, int to, + int tag, Comm comm); +template void TaggedSend (unsigned long long b, int to, + int tag, Comm comm); #endif -template void Send( float b, int to, Comm comm ); -template void Send( double b, int to, Comm comm ); -template void Send( Complex b, int to, Comm comm ); -template void Send( Complex b, int to, Comm comm ); +template void TaggedSend (float b, int to, int tag, + Comm comm); +template void TaggedSend (double b, int to, int tag, + Comm comm); +template void TaggedSend (Complex < float >b, int to, + int tag, Comm comm); +template void TaggedSend (Complex < double >b, int to, + int tag, Comm comm); + +template < typename T > void Send (T b, int to, Comm comm) +{ + TaggedSend (b, to, 0, comm); +} + +template void Send (byte b, int to, Comm comm); +template void Send (int b, int to, Comm comm); +template void Send (unsigned b, int to, Comm comm); +template void Send (long int b, int to, Comm comm); +template void Send (unsigned long b, int to, Comm comm); -template +#ifdef EL_HAVE_MPI_LONG_LONG +template void Send (long long int b, int to, Comm comm); +template void Send (unsigned long long b, int to, + Comm comm); +#endif +template void Send (float b, int to, Comm comm); +template void Send (double b, int to, Comm comm); +template void Send (Complex < float >b, int to, + Comm comm); +template void Send (Complex < double >b, int to, + Comm comm); + +template < typename R > void TaggedISend -( const R* buf, int count, int to, int tag, Comm comm, Request& request ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::ISend")) +(const R * buf, int count, int to, int tag, Comm comm, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ISend")) SafeMpi - ( MPI_Isend - ( const_cast(buf), count, TypeMap(), to, - tag, comm.comm, &request ) ); + (MPI_Isend + (const_cast < R * >(buf), count, + TypeMap < R > (), to, tag, comm.comm, + &request)); } -template +template < typename R > void TaggedISend -( const Complex* buf, int count, int to, int tag, Comm comm, - Request& request ) +(const Complex < R > *buf, int count, int to, int tag, + Comm comm, Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::ISend")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ISend")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Isend - ( const_cast*>(buf), 2*count, TypeMap(), to, tag, comm.comm, - &request ) ); + (MPI_Isend + (const_cast < Complex < R > *>(buf), 2 * count, + TypeMap < R > (), to, tag, comm.comm, + &request)); #else SafeMpi - ( MPI_Isend - ( const_cast*>(buf), count, - TypeMap>(), to, tag, comm.comm, &request ) ); + (MPI_Isend + (const_cast < Complex < R > *>(buf), count, + TypeMap < Complex < R 
>> (), to, tag, comm.comm, + &request)); #endif } -template void TaggedISend( const byte* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISend (const byte * buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const int *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const unsigned *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const long int *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const unsigned long *buf, + int count, int to, int tag, + Comm comm, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedISend( const long long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned long long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISend (const long long int *buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (const unsigned long long *buf, + int count, int to, int tag, + Comm comm, Request & request); #endif -template void TaggedISend( const float* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const double* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); - -template +template void TaggedISend (const float *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const double *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const Complex < float >*buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (const Complex < double >*buf, + int count, int to, int tag, + Comm comm, Request & request); + +template < typename T > void ISend -( const T* buf, int count, int to, Comm comm, Request& request ) -{ TaggedISend( buf, count, to, 0, comm, request ); } - -template void ISend( const byte* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const long int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned long* buf, int count, int to, Comm comm, Request& request ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void ISend( const long long int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned long long* buf, int count, int to, Comm comm, Request& request ); -#endif -template void ISend( const float* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const double* buf, int count, int 
to, Comm comm, Request& request );
-template void ISend( const Complex<float>* buf, int count, int to, Comm comm, Request& request );
-template void ISend( const Complex<double>* buf, int count, int to, Comm comm, Request& request );
+(const T * buf, int count, int to, Comm comm, Request & request)
+{
+  TaggedISend (buf, count, to, 0, comm, request);
+}
+
+template void ISend (const byte * buf, int count, int to, Comm comm, Request & request);
+template void ISend (const int *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const unsigned *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const long int *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const unsigned long *buf, int count, int to, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
+template void ISend (const long long int *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const unsigned long long *buf, int count, int to, Comm comm, Request & request);
 #endif
+template void ISend (const float *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const double *buf, int count, int to, Comm comm, Request & request);
+template void ISend (const Complex < float >*buf, int count, int to, Comm comm, Request & request);
+template void ISend (const Complex < double >*buf, int count, int to, Comm comm, Request & request);
 
-template<typename T>
-void TaggedISend( T b, int to, int tag, Comm comm, Request& request )
-{ TaggedISend( &b, 1, to, tag, comm, request ); }
-
-template void TaggedISend( byte buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( int buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( unsigned buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( long int buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( unsigned long buf, int to, int tag, Comm comm, Request& request );
+template < typename T >
+void TaggedISend (T b, int to, int tag, Comm comm, Request & request)
+{
+  TaggedISend (&b, 1, to, tag, comm, request);
+}
+
+template void TaggedISend (byte buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (int buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (unsigned buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (long int buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (unsigned long buf, int to, int tag, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedISend( long long int buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( unsigned long long buf, int to, int tag, Comm comm, Request& request );
+template void TaggedISend (long long int buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (unsigned long long buf, int to, int tag, Comm comm, Request & request);
 #endif
-template void TaggedISend( float buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( double buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( Complex<float> buf, int to, int tag, Comm comm, Request& request );
-template void TaggedISend( Complex<double> buf, int to, int tag, Comm comm, Request& request );
+template void TaggedISend (float buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (double buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (Complex < float >buf, int to, int tag, Comm comm, Request & request);
+template void TaggedISend (Complex < double >buf, int to, int tag, Comm comm, Request & request);
 
-template<typename T>
-void ISend( T b, int to, Comm comm, Request& request )
-{ TaggedISend( b, to, 0, comm, request ); }
-
-template void ISend( byte buf, int to, Comm comm, Request& request );
-template void ISend( int buf, int to, Comm comm, Request& request );
-template void ISend( unsigned buf, int to, Comm comm, Request& request );
-template void ISend( long int buf, int to, Comm comm, Request& request );
-template void ISend( unsigned long buf, int to, Comm comm, Request& request );
+template < typename T >
+void ISend (T b, int to, Comm comm, Request & request)
+{
+  TaggedISend (b, to, 0, comm, request);
+}
+
+template void ISend (byte buf, int to, Comm comm, Request & request);
+template void ISend (int buf, int to, Comm comm, Request & request);
+template void ISend (unsigned buf, int to, Comm comm, Request & request);
+template void ISend (long int buf, int to, Comm comm, Request & request);
+template void ISend (unsigned long buf, int to, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void ISend( long long int buf, int to, Comm comm, Request& request );
-template void ISend( unsigned long long buf, int to, Comm comm, Request& request );
+template void ISend (long long int buf, int to, Comm comm, Request & request);
+template void ISend (unsigned long long buf, int to, Comm comm, Request & request);
 #endif
-template void ISend( float buf, int to, Comm comm, Request& request );
-template void ISend( double buf, int to, Comm comm, Request& request );
-template void ISend( Complex<float> buf, int to, Comm comm, Request& request );
-template void ISend( Complex<double> buf, int to, Comm comm, Request& request );
+template void ISend (float buf, int to, Comm comm, Request & request);
+template void ISend (double buf, int to, Comm comm, Request & request);
+template void ISend (Complex < float >buf, int to, Comm comm, Request & request);
+template void ISend (Complex < double >buf, int to, Comm comm, Request & request);
 
-template<typename R>
+template < typename R >
 void TaggedISSend
-( const R* buf, int count, int to, int tag, Comm comm, Request& request )
+(const R * buf, int count, int to, int tag, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::ISSend"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend"))
     SafeMpi
-    ( MPI_Issend
-      ( const_cast<R*>(buf), count, TypeMap<R>(), to,
-        tag, comm.comm, &request ) );
+    (MPI_Issend
+     (const_cast < R * >(buf), count, TypeMap < R > (), to, tag, comm.comm, &request));
 }
 
-template<typename R>
+template < typename R >
 void TaggedISSend
-( const Complex<R>* buf, int count, int to, int tag, Comm comm,
-  Request& request )
+(const Complex < R > *buf, int count, int to, int tag, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::ISSend"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Issend
-      ( const_cast<Complex<R>*>(buf), 2*count, TypeMap<R>(), to, tag, comm.comm,
-        &request ) );
+    (MPI_Issend
+     (const_cast < Complex < R > *>(buf), 2 * count, TypeMap < R > (), to, tag, comm.comm, &request));
 #else
     SafeMpi
-    ( MPI_Issend
-      ( const_cast<Complex<R>*>(buf), count,
-        TypeMap<Complex<R>>(), to, tag, comm.comm, &request ) );
+    (MPI_Issend
+     (const_cast < Complex < R > *>(buf), count, TypeMap < Complex < R >> (), to, tag, comm.comm, &request));
 #endif
 }
 
-template void TaggedISSend( const byte* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const int* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const unsigned* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const long int* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const unsigned long* buf, int count, int to, int tag, Comm comm, Request& request );
+template void TaggedISSend (const byte * buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const int *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const unsigned *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const long int *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const unsigned long *buf, int count, int to, int tag, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedISSend( const long long int* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const unsigned long long* buf, int count, int to, int tag, Comm comm, Request& request );
+template void TaggedISSend (const long long int *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const unsigned long long *buf, int count, int to, int tag, Comm comm, Request & request);
 #endif
-template void TaggedISSend( const float* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const double* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const Complex<float>* buf, int count, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( const Complex<double>* buf, int count, int to, int tag, Comm comm, Request& request );
+template void TaggedISSend (const float *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const double *buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const Complex < float >*buf, int count, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (const Complex < double >*buf, int count, int to, int tag, Comm comm, Request & request);
 
-template<typename T>
-void ISSend( const T* buf, int count, int to, Comm comm, Request& request )
-{ TaggedISSend( buf, count, to, 0, comm, request ); }
-
-template void ISSend( const byte* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const int* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const unsigned* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const long int* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const unsigned long* buf, int count, int to, Comm comm, Request& request );
+template < typename T >
+void ISSend (const T * buf, int count, int to, Comm comm, Request & request)
+{
+  TaggedISSend (buf, count, to, 0, comm, request);
+}
+
+template void ISSend (const byte * buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const int *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const unsigned *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const long int *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const unsigned long *buf, int count, int to, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void ISSend( const long long int* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const unsigned long long* buf, int count, int to, Comm comm, Request& request );
+template void ISSend (const long long int *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const unsigned long long *buf, int count, int to, Comm comm, Request & request);
 #endif
-template void ISSend( const float* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const double* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const Complex<float>* buf, int count, int to, Comm comm, Request& request );
-template void ISSend( const Complex<double>* buf, int count, int to, Comm comm, Request& request );
+template void ISSend (const float *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const double *buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const Complex < float >*buf, int count, int to, Comm comm, Request & request);
+template void ISSend (const Complex < double >*buf, int count, int to, Comm comm, Request & request);
 
-template<typename T>
-void TaggedISSend( T b, int to, int tag, Comm comm, Request& request )
-{ TaggedISSend( &b, 1, to, tag, comm, request ); }
-
-template void TaggedISSend( byte b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( int b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( unsigned b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( long int b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( unsigned long b, int to, int tag, Comm comm, Request& request );
+template < typename T >
+void TaggedISSend (T b, int to, int tag, Comm comm, Request & request)
+{
+  TaggedISSend (&b, 1, to, tag, comm, request);
+}
+
+template void TaggedISSend (byte b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (int b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (unsigned b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (long int b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (unsigned long b, int to, int tag, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedISSend( long long int b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( unsigned long long b, int to, int tag, Comm comm, Request& request );
+template void TaggedISSend (long long int b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (unsigned long long b, int to, int tag, Comm comm, Request & request);
 #endif
-template void TaggedISSend( float b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( double b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( Complex<float> b, int to, int tag, Comm comm, Request& request );
-template void TaggedISSend( Complex<double> b, int to, int tag, Comm comm, Request& request );
+template void TaggedISSend (float b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (double b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (Complex < float >b, int to, int tag, Comm comm, Request & request);
+template void TaggedISSend (Complex < double >b, int to, int tag, Comm comm, Request & request);
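[Editorial note: the synchronous-mode sends above are what make a local termination test possible: MPI_Issend only completes once the receiver has matched the message. A minimal NBX-style sketch built from these wrappers follows; it is illustrative only, and `Message`, `HandleIncoming`, and `tag` are hypothetical names, not part of the patch.]

    // Hypothetical dynamic-sparse exchange using the wrappers instantiated above.
    void ExchangeDynamically
    ( const std::vector<Message>& outgoing, int tag, mpi::Comm comm )
    {
        std::vector<mpi::Request> requests( outgoing.size() );
        for( std::size_t k=0; k<outgoing.size(); ++k )
            mpi::TaggedISSend
            ( outgoing[k].payload.data(), int(outgoing[k].payload.size()),
              outgoing[k].rank, tag, comm, requests[k] );
        // An Issend only completes once the receiver has matched it, so
        // completion of every local send is a purely local test.
        bool allMatched = false;
        while( !allMatched )
        {
            mpi::Status status;
            if( mpi::IProbe( mpi::ANY_SOURCE, tag, comm, status ) )
                HandleIncoming( status );
            allMatched = true;
            for( auto& request : requests )
                if( !mpi::Test( request ) )
                    allMatched = false;
        }
    }

[A full nonblocking-consensus loop would follow this with a nonblocking barrier, posted only once all local sends have completed, and keep probing until the barrier completes.]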
-template<typename R>
+template < typename R >
 void TaggedRecv
-( R* buf, int count, int from, int tag, Comm comm )
+(R * buf, int count, int from, int tag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Recv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) Status status;
     SafeMpi
-    ( MPI_Recv( buf, count, TypeMap<R>(), from, tag, comm.comm, &status ) );
+    (MPI_Recv (buf, count, TypeMap < R > (), from, tag, comm.comm, &status));
 }
 
-template<typename R>
+template < typename R >
 void TaggedRecv
-( Complex<R>* buf, int count, int from, int tag, Comm comm )
+(Complex < R > *buf, int count, int from, int tag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Recv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) Status status;
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Recv( buf, 2*count, TypeMap<R>(), from, tag, comm.comm, &status ) );
+    (MPI_Recv (buf, 2 * count, TypeMap < R > (), from, tag, comm.comm, &status));
 #else
     SafeMpi
-    ( MPI_Recv
-      ( buf, count, TypeMap<Complex<R>>(), from, tag, comm.comm, &status ) );
+    (MPI_Recv (buf, count, TypeMap < Complex < R >> (), from, tag, comm.comm, &status));
 #endif
 }
 
-template void TaggedRecv( byte* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( int* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( unsigned* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( long int* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( unsigned long* buf, int count, int from, int tag, Comm comm );
+template void TaggedRecv (byte * buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (int *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (unsigned *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (long int *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (unsigned long *buf, int count, int from, int tag, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedRecv( long long int* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( unsigned long long* buf, int count, int from, int tag, Comm comm );
+template void TaggedRecv (long long int *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (unsigned long long *buf, int count, int from, int tag, Comm comm);
 #endif
-template void TaggedRecv( float* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( double* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( Complex<float>* buf, int count, int from, int tag, Comm comm );
-template void TaggedRecv( Complex<double>* buf, int count, int from, int tag, Comm comm );
+template void TaggedRecv (float *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (double *buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (Complex < float >*buf, int count, int from, int tag, Comm comm);
+template void TaggedRecv (Complex < double >*buf, int count, int from, int tag, Comm comm);
 
-template<typename T>
-void Recv( T* buf, int count, int from, Comm comm )
-{ TaggedRecv( buf, count, from, mpi::ANY_TAG, comm ); }
-
-template void Recv( byte* buf, int count, int from, Comm comm );
-template void Recv( int* buf, int count, int from, Comm comm );
-template void Recv( unsigned* buf, int count, int from, Comm comm );
-template void Recv( long int* buf, int count, int from, Comm comm );
-template void Recv( unsigned long* buf, int count, int from, Comm comm );
+template < typename T >
+void Recv (T * buf, int count, int from, Comm comm)
+{
+  TaggedRecv (buf, count, from, mpi::ANY_TAG, comm);
+}
+
+template void Recv (byte * buf, int count, int from, Comm comm);
+template void Recv (int *buf, int count, int from, Comm comm);
+template void Recv (unsigned *buf, int count, int from, Comm comm);
+template void Recv (long int *buf, int count, int from, Comm comm);
+template void Recv (unsigned long *buf, int count, int from, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Recv( long long int* buf, int count, int from, Comm comm );
-template void Recv( unsigned long long* buf, int count, int from, Comm comm );
+template void Recv (long long int *buf, int count, int from, Comm comm);
+template void Recv (unsigned long long *buf, int count, int from, Comm comm);
 #endif
-template void Recv( float* buf, int count, int from, Comm comm );
-template void Recv( double* buf, int count, int from, Comm comm );
-template void Recv( Complex<float>* buf, int count, int from, Comm comm );
-template void Recv( Complex<double>* buf, int count, int from, Comm comm );
+template void Recv (float *buf, int count, int from, Comm comm);
+template void Recv (double *buf, int count, int from, Comm comm);
+template void Recv (Complex < float >*buf, int count, int from, Comm comm);
+template void Recv (Complex < double >*buf, int count, int from, Comm comm);
 
-template<typename T>
-T TaggedRecv( int from, int tag, Comm comm )
-{ T b; TaggedRecv( &b, 1, from, tag, comm ); return b; }
-
-template byte TaggedRecv( int from, int tag, Comm comm );
-template int TaggedRecv( int from, int tag, Comm comm );
-template unsigned TaggedRecv( int from, int tag, Comm comm );
-template long int TaggedRecv( int from, int tag, Comm comm );
-template unsigned long TaggedRecv( int from, int tag, Comm comm );
+template < typename T > T TaggedRecv (int from, int tag, Comm comm)
+{
+  T b;
+
+  TaggedRecv (&b, 1, from, tag, comm);
+  return b;
+}
+
+template byte TaggedRecv (int from, int tag, Comm comm);
+template int TaggedRecv (int from, int tag, Comm comm);
+template unsigned TaggedRecv (int from, int tag, Comm comm);
+template long int TaggedRecv (int from, int tag, Comm comm);
+template unsigned long TaggedRecv (int from, int tag, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int TaggedRecv( int from, int tag, Comm comm );
-template unsigned long long TaggedRecv( int from, int tag, Comm comm );
+template long long int TaggedRecv (int from, int tag, Comm comm);
+template unsigned long long TaggedRecv (int from, int tag, Comm comm);
 #endif
-template float TaggedRecv( int from, int tag, Comm comm );
-template double TaggedRecv( int from, int tag, Comm comm );
-template Complex<float> TaggedRecv( int from, int tag, Comm comm );
-template Complex<double> TaggedRecv( int from, int tag, Comm comm );
+template float TaggedRecv (int from, int tag, Comm comm);
+template double TaggedRecv (int from, int tag, Comm comm);
+template Complex < float >TaggedRecv (int from, int tag, Comm comm);
+template Complex < double >TaggedRecv (int from, int tag, Comm comm);
 
-template<typename T>
-T Recv( int from, Comm comm )
-{ return TaggedRecv<T>( from, mpi::ANY_TAG, comm ); }
-
-template byte Recv( int from, Comm comm );
-template int Recv( int from, Comm comm );
-template unsigned Recv( int from, Comm comm );
-template long int Recv( int from, Comm comm );
-template unsigned long Recv( int from, Comm comm );
+template < typename T > T Recv (int from, Comm comm)
+{
+  return TaggedRecv < T > (from, mpi::ANY_TAG, comm);
+}
+
+template byte Recv (int from, Comm comm);
+template int Recv (int from, Comm comm);
+template unsigned Recv (int from, Comm comm);
+template long int Recv (int from, Comm comm);
+template unsigned long Recv (int from, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int Recv( int from, Comm comm );
-template unsigned long long Recv( int from, Comm comm );
+template long long int Recv (int from, Comm comm);
+template unsigned long long Recv (int from, Comm comm);
 #endif
-template float Recv( int from, Comm comm );
-template double Recv( int from, Comm comm );
-template Complex<float> Recv( int from, Comm comm );
-template Complex<double> Recv( int from, Comm comm );
+template float Recv (int from, Comm comm);
+template double Recv (int from, Comm comm);
+template Complex < float >Recv (int from, Comm comm);
+template Complex < double >Recv (int from, Comm comm);
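[Editorial note: a receiver that does not know the incoming message size ahead of time pairs these blocking receives with IProbe and GetCount, which is the pattern the interface code in this patch uses. A hedged sketch; `tag` and `recvVec` are hypothetical names:]

    // Probe for any matching message, size the buffer from the status,
    // and only then receive the payload.
    mpi::Status status;
    if( mpi::IProbe( mpi::ANY_SOURCE, tag, comm, status ) )
    {
        const int count = mpi::GetCount<byte>( status );
        recvVec.resize( count );
        mpi::TaggedRecv( recvVec.data(), count, status.MPI_SOURCE, tag, comm );
    }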
-template<typename R>
+template < typename R >
 void TaggedIRecv
-( R* buf, int count, int from, int tag, Comm comm, Request& request )
+(R * buf, int count, int from, int tag, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IRecv"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IRecv"))
     SafeMpi
-    ( MPI_Irecv( buf, count, TypeMap<R>(), from, tag, comm.comm, &request ) );
+    (MPI_Irecv (buf, count, TypeMap < R > (), from, tag, comm.comm, &request));
 }
 
-template<typename R>
+template < typename R >
 void TaggedIRecv
-( Complex<R>* buf, int count, int from, int tag, Comm comm, Request& request )
+(Complex < R > *buf, int count, int from, int tag, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IRecv"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IRecv"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Irecv( buf, 2*count, TypeMap<R>(), from, tag, comm.comm, &request ) );
+    (MPI_Irecv (buf, 2 * count, TypeMap < R > (), from, tag, comm.comm, &request));
 #else
     SafeMpi
-    ( MPI_Irecv
-      ( buf, count, TypeMap<Complex<R>>(), from, tag, comm.comm, &request ) );
+    (MPI_Irecv (buf, count, TypeMap < Complex < R >> (), from, tag, comm.comm, &request));
 #endif
 }
 
-template void TaggedIRecv( byte* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( int* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( unsigned* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( long int* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( unsigned long* buf, int count, int from, int tag, Comm comm, Request& request );
+template void TaggedIRecv (byte * buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (int *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (unsigned *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (long int *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (unsigned long *buf, int count, int from, int tag, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedIRecv( long long int* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( unsigned long long* buf, int count, int from, int tag, Comm comm, Request& request );
+template void TaggedIRecv (long long int *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (unsigned long long *buf, int count, int from, int tag, Comm comm, Request & request);
 #endif
-template void TaggedIRecv( float* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( double* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( Complex<float>* buf, int count, int from, int tag, Comm comm, Request& request );
-template void TaggedIRecv( Complex<double>* buf, int count, int from, int tag, Comm comm, Request& request );
+template void TaggedIRecv (float *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (double *buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (Complex < float >*buf, int count, int from, int tag, Comm comm, Request & request);
+template void TaggedIRecv (Complex < double >*buf, int count, int from, int tag, Comm comm, Request & request);
 
-template<typename T>
-void IRecv( T* buf, int count, int from, Comm comm, Request& request )
-{ TaggedIRecv( buf, count, from, mpi::ANY_TAG, comm, request ); }
-
-template void IRecv( byte* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( int* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( unsigned* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( long int* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( unsigned long* buf, int count, int from, Comm comm, Request& request );
+template < typename T >
+void IRecv (T * buf, int count, int from, Comm comm, Request & request)
+{
+  TaggedIRecv (buf, count, from, mpi::ANY_TAG, comm, request);
+}
+
+template void IRecv (byte * buf, int count, int from, Comm comm, Request & request);
+template void IRecv (int *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (unsigned *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (long int *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (unsigned long *buf, int count, int from, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void IRecv( long long int* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( unsigned long long* buf, int count, int from, Comm comm, Request& request );
+template void IRecv (long long int *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (unsigned long long *buf, int count, int from, Comm comm, Request & request);
 #endif
-template void IRecv( float* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( double* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( Complex<float>* buf, int count, int from, Comm comm, Request& request );
-template void IRecv( Complex<double>* buf, int count, int from, Comm comm, Request& request );
+template void IRecv (float *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (double *buf, int count, int from, Comm comm, Request & request);
+template void IRecv (Complex < float >*buf, int count, int from, Comm comm, Request & request);
+template void IRecv (Complex < double >*buf, int count, int from, Comm comm, Request & request);
 
-template<typename T>
-T TaggedIRecv( int from, int tag, Comm comm, Request& request )
-{ T b; TaggedIRecv( &b, 1, from, tag, comm, request ); return b; }
-
-template byte TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template int TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template unsigned TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template long int TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template unsigned long TaggedIRecv( int from, int tag, Comm comm, Request& request );
+template < typename T >
+T TaggedIRecv (int from, int tag, Comm comm, Request & request)
+{
+  T b;
+
+  TaggedIRecv (&b, 1, from, tag, comm, request);
+  return b;
+}
+
+template byte TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template int TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template unsigned TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template long int TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template unsigned long TaggedIRecv (int from, int tag, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template unsigned long long TaggedIRecv( int from, int tag, Comm comm, Request& request );
+template long long int TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template unsigned long long TaggedIRecv (int from, int tag, Comm comm, Request & request);
 #endif
-template float TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template double TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template Complex<float> TaggedIRecv( int from, int tag, Comm comm, Request& request );
-template Complex<double> TaggedIRecv( int from, int tag, Comm comm, Request& request );
+template float TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template double TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template Complex < float >TaggedIRecv (int from, int tag, Comm comm, Request & request);
+template Complex < double >TaggedIRecv (int from, int tag, Comm comm, Request & request);
 
-template<typename T>
-T IRecv( int from, Comm comm, Request& request )
-{ return TaggedIRecv<T>( from, mpi::ANY_TAG, comm, request ); }
-
-template byte IRecv( int from, Comm comm, Request& request );
-template int IRecv( int from, Comm comm, Request& request );
-template unsigned IRecv( int from, Comm comm, Request& request );
-template long int IRecv( int from, Comm comm, Request& request );
-template unsigned long IRecv( int from, Comm comm, Request& request );
+template < typename T >
+T IRecv (int from, Comm comm, Request & request)
+{
+  return TaggedIRecv < T > (from, mpi::ANY_TAG, comm, request);
+}
+
+template byte IRecv (int from, Comm comm, Request & request);
+template int IRecv (int from, Comm comm, Request & request);
+template unsigned IRecv (int from, Comm comm, Request & request);
+template long int IRecv (int from, Comm comm, Request & request);
+template unsigned long IRecv (int from, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int IRecv( int from, Comm comm, Request& request );
-template unsigned long long IRecv( int from, Comm comm, Request& request );
+template long long int IRecv (int from, Comm comm, Request & request);
+template unsigned long long IRecv (int from, Comm comm, Request & request);
 #endif
-template float IRecv( int from, Comm comm, Request& request );
-template double IRecv( int from, Comm comm, Request& request );
-template Complex<float> IRecv( int from, Comm comm, Request& request );
-template Complex<double> IRecv( int from, Comm comm, Request& request );
+template float IRecv (int from, Comm comm, Request & request);
+template double IRecv (int from, Comm comm, Request & request);
+template Complex < float >IRecv (int from, Comm comm, Request & request);
+template Complex < double >IRecv (int from, Comm comm, Request & request);
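[Editorial note: these immediate-mode receives are the natural counterpart of the ISend family; a typical pairing posts the receive first and then waits on both requests. Illustrative only; `sendBuf`, `recvBuf`, `n`, and `partner` are hypothetical names:]

    // Exchange n doubles with `partner` without either side blocking early.
    mpi::Request sendRequest, recvRequest;
    mpi::TaggedIRecv( recvBuf, n, partner, 0, comm, recvRequest );
    mpi::TaggedISend( sendBuf, n, partner, 0, comm, sendRequest );
    mpi::Wait( recvRequest ); // recvBuf is now filled
    mpi::Wait( sendRequest ); // sendBuf may now be reused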
-template<typename R>
+template < typename R >
 void TaggedSendRecv
-( const R* sbuf, int sc, int to, int stag,
-  R* rbuf, int rc, int from, int rtag, Comm comm )
+(const R * sbuf, int sc, int to, int stag, R * rbuf, int rc, int from, int rtag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status;
+
     SafeMpi
-    ( MPI_Sendrecv
-      ( const_cast<R*>(sbuf), sc, TypeMap<R>(), to, stag,
-        rbuf, rc, TypeMap<R>(), from, rtag,
-        comm.comm, &status ) );
+    (MPI_Sendrecv
+     (const_cast < R * >(sbuf), sc, TypeMap < R > (), to, stag, rbuf, rc, TypeMap < R > (), from, rtag, comm.comm, &status));
 }
 
-template<typename R>
+template < typename R >
 void TaggedSendRecv
-( const Complex<R>* sbuf, int sc, int to, int stag,
-  Complex<R>* rbuf, int rc, int from, int rtag, Comm comm )
+(const Complex < R > *sbuf, int sc, int to, int stag, Complex < R > *rbuf, int rc, int from, int rtag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status;
+
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Sendrecv
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(), to, stag,
-        rbuf, 2*rc, TypeMap<R>(), from, rtag,
-        comm.comm, &status ) );
+    (MPI_Sendrecv
+     (const_cast < Complex < R > *>(sbuf), 2 * sc, TypeMap < R > (), to, stag, rbuf, 2 * rc, TypeMap < R > (), from, rtag, comm.comm, &status));
#else
     SafeMpi
-    ( MPI_Sendrecv
-      ( const_cast<Complex<R>*>(sbuf), sc, TypeMap<Complex<R>>(), to, stag,
-        rbuf, rc, TypeMap<Complex<R>>(), from, rtag, comm.comm, &status ) );
+    (MPI_Sendrecv
+     (const_cast < Complex < R > *>(sbuf), sc, TypeMap < Complex < R >> (), to, stag, rbuf, rc, TypeMap < Complex < R >> (), from, rtag, comm.comm, &status));
 #endif
 }
 
-template void TaggedSendRecv( const byte* sbuf, int sc, int to, int stag, byte* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const int* sbuf, int sc, int to, int stag, int* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const unsigned* sbuf, int sc, int to, int stag, unsigned* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const long int* sbuf, int sc, int to, int stag, long int* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const unsigned long* sbuf, int sc, int to, int stag, unsigned long* rbuf, int rc, int from, int rtag, Comm comm );
+template void TaggedSendRecv (const byte * sbuf, int sc, int to, int stag, byte * rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const int *sbuf, int sc, int to, int stag, int *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const unsigned *sbuf, int sc, int to, int stag, unsigned *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const long int *sbuf, int sc, int to, int stag, long int *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const unsigned long *sbuf, int sc, int to, int stag, unsigned long *rbuf, int rc, int from, int rtag, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedSendRecv( const long long int* sbuf, int sc, int to, int stag, long long int* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const unsigned long long* sbuf, int sc, int to, int stag, unsigned long long* rbuf, int rc, int from, int rtag, Comm comm );
+template void TaggedSendRecv (const long long int *sbuf, int sc, int to, int stag, long long int *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const unsigned long long *sbuf, int sc, int to, int stag, unsigned long long *rbuf, int rc, int from, int rtag, Comm comm);
 #endif
-template void TaggedSendRecv( const float* sbuf, int sc, int to, int stag, float* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const double* sbuf, int sc, int to, int stag, double* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const Complex<float>* sbuf, int sc, int to, int stag, Complex<float>* rbuf, int rc, int from, int rtag, Comm comm );
-template void TaggedSendRecv( const Complex<double>* sbuf, int sc, int to, int stag, Complex<double>* rbuf, int rc, int from, int rtag, Comm comm );
+template void TaggedSendRecv (const float *sbuf, int sc, int to, int stag, float *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const double *sbuf, int sc, int to, int stag, double *rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const Complex < float >*sbuf, int sc, int to, int stag, Complex < float >*rbuf, int rc, int from, int rtag, Comm comm);
+template void TaggedSendRecv (const Complex < double >*sbuf, int sc, int to, int stag, Complex < double >*rbuf, int rc, int from, int rtag, Comm comm);
 
-template<typename T>
-void SendRecv
-( const T* sbuf, int sc, int to,
-  T* rbuf, int rc, int from, Comm comm )
-{ TaggedSendRecv( sbuf, sc, to, 0, rbuf, rc, from, mpi::ANY_TAG, comm ); }
+template < typename T >
+void SendRecv
+(const T * sbuf, int sc, int to, T * rbuf, int rc, int from, Comm comm)
+{
+  TaggedSendRecv (sbuf, sc, to, 0, rbuf, rc, from, mpi::ANY_TAG, comm);
+}
 
-template void SendRecv( const byte* sbuf, int sc, int to, byte* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const int* sbuf, int sc, int to, int* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const unsigned* sbuf, int sc, int to, unsigned* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const long int* sbuf, int sc, int to, long int* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const unsigned long* sbuf, int sc, int to, unsigned long* rbuf, int rc, int from, Comm comm );
+template void SendRecv (const byte * sbuf, int sc, int to, byte * rbuf, int rc, int from, Comm comm);
+template void SendRecv (const int *sbuf, int sc, int to, int *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const unsigned *sbuf, int sc, int to, unsigned *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const long int *sbuf, int sc, int to, long int *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const unsigned long *sbuf, int sc, int to, unsigned long *rbuf, int rc, int from, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void SendRecv( const long long int* sbuf, int sc, int to, long long int* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const unsigned long long* sbuf, int sc, int to, unsigned long long* rbuf, int rc, int from, Comm comm );
+template void SendRecv (const long long int *sbuf, int sc, int to, long long int *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const unsigned long long *sbuf, int sc, int to, unsigned long long *rbuf, int rc, int from, Comm comm);
 #endif
-template void SendRecv( const float* sbuf, int sc, int to, float* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const double* sbuf, int sc, int to, double* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const Complex<float>* sbuf, int sc, int to, Complex<float>* rbuf, int rc, int from, Comm comm );
-template void SendRecv( const Complex<double>* sbuf, int sc, int to, Complex<double>* rbuf, int rc, int from, Comm comm );
+template void SendRecv (const float *sbuf, int sc, int to, float *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const double *sbuf, int sc, int to, double *rbuf, int rc, int from, Comm comm);
+template void SendRecv (const Complex < float >*sbuf, int sc, int to, Complex < float >*rbuf, int rc, int from, Comm comm);
+template void SendRecv (const Complex < double >*sbuf, int sc, int to, Complex < double >*rbuf, int rc, int from, Comm comm);
 
-template<typename T>
-T TaggedSendRecv( T sb, int to, int stag, int from, int rtag, Comm comm )
+template < typename T >
+T TaggedSendRecv (T sb, int to, int stag, int from, int rtag, Comm comm)
 {
-    T rb;
-    TaggedSendRecv( &sb, 1, to, stag, &rb, 1, from, rtag, comm );
-    return rb;
+  T rb;
+
+  TaggedSendRecv (&sb, 1, to, stag, &rb, 1, from, rtag, comm);
+  return rb;
 }
 
-template byte TaggedSendRecv( byte sb, int to, int stag, int from, int rtag, Comm comm );
-template int TaggedSendRecv( int sb, int to, int stag, int from, int rtag, Comm comm );
-template unsigned TaggedSendRecv( unsigned sb, int to, int stag, int from, int rtag, Comm comm );
-template long int TaggedSendRecv( long int sb, int to, int stag, int from, int rtag, Comm comm );
-template unsigned long TaggedSendRecv( unsigned long sb, int to, int stag, int from, int rtag, Comm comm );
+template byte TaggedSendRecv (byte sb, int to, int stag, int from, int rtag, Comm comm);
+template int TaggedSendRecv (int sb, int to, int stag, int from, int rtag, Comm comm);
+template unsigned TaggedSendRecv (unsigned sb, int to, int stag, int from, int rtag, Comm comm);
+template long int TaggedSendRecv (long int sb, int to, int stag, int from, int rtag, Comm comm);
+template unsigned long TaggedSendRecv (unsigned long sb, int to, int stag, int from, int rtag, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int TaggedSendRecv( long long int sb, int to, int stag, int from, int rtag, Comm comm );
-template unsigned long long TaggedSendRecv( unsigned long long sb, int to, int stag, int from, int rtag, Comm comm );
+template long long int TaggedSendRecv (long long int sb, int to, int stag, int from, int rtag, Comm comm);
+template unsigned long long TaggedSendRecv (unsigned long long sb, int to, int stag, int from, int rtag, Comm comm);
 #endif
-template float TaggedSendRecv( float sb, int to, int stag, int from, int rtag, Comm comm );
-template double TaggedSendRecv( double sb, int to, int stag, int from, int rtag, Comm comm );
-template Complex<float> TaggedSendRecv( Complex<float> sb, int to, int stag, int from, int rtag, Comm comm );
-template Complex<double> TaggedSendRecv( Complex<double> sb, int to, int stag, int from, int rtag, Comm comm );
+template float TaggedSendRecv (float sb, int to, int stag, int from, int rtag, Comm comm);
+template double TaggedSendRecv (double sb, int to, int stag, int from, int rtag, Comm comm);
+template Complex < float >TaggedSendRecv (Complex < float >sb, int to, int stag, int from, int rtag, Comm comm);
+template Complex < double >TaggedSendRecv (Complex < double >sb, int to, int stag, int from, int rtag, Comm comm);
 
-template<typename T>
-T SendRecv( T sb, int to, int from, Comm comm )
-{ return TaggedSendRecv( sb, to, 0, from, mpi::ANY_TAG, comm ); }
-
-template byte SendRecv( byte sb, int to, int from, Comm comm );
-template int SendRecv( int sb, int to, int from, Comm comm );
-template unsigned SendRecv( unsigned sb, int to, int from, Comm comm );
-template long int SendRecv( long int sb, int to, int from, Comm comm );
-template unsigned long SendRecv( unsigned long sb, int to, int from, Comm comm );
+template < typename T >
+T SendRecv (T sb, int to, int from, Comm comm)
+{
+  return TaggedSendRecv (sb, to, 0, from, mpi::ANY_TAG, comm);
+}
+
+template byte SendRecv (byte sb, int to, int from, Comm comm);
+template int SendRecv (int sb, int to, int from, Comm comm);
+template unsigned SendRecv (unsigned sb, int to, int from, Comm comm);
+template long int SendRecv (long int sb, int to, int from, Comm comm);
+template unsigned long SendRecv (unsigned long sb, int to, int from, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int SendRecv( long long int sb, int to, int from, Comm comm );
-template unsigned long long SendRecv( unsigned long long sb, int to, int from, Comm comm );
+template long long int SendRecv (long long int sb, int to, int from, Comm comm);
+template unsigned long long SendRecv (unsigned long long sb, int to, int from, Comm comm);
 #endif
-template float SendRecv( float sb, int to, int from, Comm comm );
-template double SendRecv( double sb, int to, int from, Comm comm );
-template Complex<float> SendRecv( Complex<float> sb, int to, int from, Comm comm );
-template Complex<double> SendRecv( Complex<double> sb, int to, int from, Comm comm );
+template float SendRecv (float sb, int to, int from, Comm comm);
+template double SendRecv (double sb, int to, int from, Comm comm);
+template Complex < float >SendRecv (Complex < float >sb, int to, int from, Comm comm);
+template Complex < double >SendRecv (Complex < double >sb, int to, int from, Comm comm);
 
-template<typename R>
+template < typename R >
 void TaggedSendRecv
-( R* buf, int count, int to, int stag, int from, int rtag, Comm comm )
+(R * buf, int count, int to, int stag, int from, int rtag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status;
+
     SafeMpi
-    ( MPI_Sendrecv_replace
-      ( buf, count, TypeMap<R>(), to, stag, from, rtag, comm.comm, &status ) );
+    (MPI_Sendrecv_replace
+     (buf, count, TypeMap < R > (), to, stag, from, rtag, comm.comm, &status));
 }
 
-template<typename R>
+template < typename R >
 void TaggedSendRecv
-( Complex<R>* buf, int count, int to, int stag, int from, int rtag, Comm comm )
+(Complex < R > *buf, int count, int to, int stag, int from, int rtag, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv"))
-    Status status;
+  DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status;
+
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Sendrecv_replace
-      ( buf, 2*count, TypeMap<R>(), to, stag, from, rtag, comm.comm,
-        &status ) );
+    (MPI_Sendrecv_replace
+     (buf, 2 * count, TypeMap < R > (), to, stag, from, rtag, comm.comm, &status));
 #else
     SafeMpi
-    ( MPI_Sendrecv_replace
-      ( buf, count, TypeMap<Complex<R>>(),
-        to, stag, from, rtag, comm.comm, &status ) );
+    (MPI_Sendrecv_replace
+     (buf, count, TypeMap < Complex < R >> (), to, stag, from, rtag, comm.comm, &status));
 #endif
 }
 
-template void TaggedSendRecv( byte* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( int* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( unsigned* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( long int* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( unsigned long* buf, int count, int to, int stag, int from, int rtag, Comm comm );
+template void TaggedSendRecv (byte * buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (int *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (unsigned *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (long int *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (unsigned long *buf, int count, int to, int stag, int from, int rtag, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void TaggedSendRecv( long long int* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( unsigned long long* buf, int count, int to, int stag, int from, int rtag, Comm comm );
+template void TaggedSendRecv (long long int *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (unsigned long long *buf, int count, int to, int stag, int from, int rtag, Comm comm);
 #endif
-template void TaggedSendRecv( float* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( double* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( Complex<float>* buf, int count, int to, int stag, int from, int rtag, Comm comm );
-template void TaggedSendRecv( Complex<double>* buf, int count, int to, int stag, int from, int rtag, Comm comm );
+template void TaggedSendRecv (float *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (double *buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (Complex < float >*buf, int count, int to, int stag, int from, int rtag, Comm comm);
+template void TaggedSendRecv (Complex < double >*buf, int count, int to, int stag, int from, int rtag, Comm comm);
 
-template<typename T>
-void SendRecv( T* buf, int count, int to, int from, Comm comm )
-{ TaggedSendRecv( buf, count, to, 0, from, mpi::ANY_TAG, comm ); }
-
-template void SendRecv( byte* buf, int count, int to, int from, Comm comm );
-template void SendRecv( int* buf, int count, int to, int from, Comm comm );
-template void SendRecv( unsigned* buf, int count, int to, int from, Comm comm );
-template void SendRecv( long int* buf, int count, int to, int from, Comm comm );
-template void SendRecv( unsigned long* buf, int count, int to, int from, Comm comm );
+template < typename T >
+void SendRecv (T * buf, int count, int to, int from, Comm comm)
+{
+  TaggedSendRecv (buf, count, to, 0, from, mpi::ANY_TAG, comm);
+}
+
+template void SendRecv (byte * buf, int count, int to, int from, Comm comm);
+template void SendRecv (int *buf, int count, int to, int from, Comm comm);
+template void SendRecv (unsigned *buf, int count, int to, int from, Comm comm);
+template void SendRecv (long int *buf, int count, int to, int from, Comm comm);
+template void SendRecv (unsigned long *buf, int count, int to, int from, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void SendRecv( long long int* buf, int count, int to, int from, Comm comm );
-template void SendRecv( unsigned long long* buf, int count, int to, int from, Comm comm );
+template void SendRecv (long long int *buf, int count, int to, int from, Comm comm);
+template void SendRecv (unsigned long long *buf, int count, int to, int from, Comm comm);
 #endif
-template void SendRecv( float* buf, int count, int to, int from, Comm comm );
-template void SendRecv( double* buf, int count, int to, int from, Comm comm );
-template void SendRecv( Complex<float>* buf, int count, int to, int from, Comm comm );
-template void SendRecv( Complex<double>* buf, int count, int to, int from, Comm comm );
+template void SendRecv (float *buf, int count, int to, int from, Comm comm);
+template void SendRecv (double *buf, int count, int to, int from, Comm comm);
+template void SendRecv (Complex < float >*buf, int count, int to, int from, Comm comm);
+template void SendRecv (Complex < double >*buf, int count, int to, int from, Comm comm);
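[Editorial note: the in-place overload above is convenient for ring shifts, where every rank forwards its buffer to one neighbor and receives its other neighbor's in a single deadlock-free call. A minimal sketch; `buf` and `count` are hypothetical names:]

    // Shift `buf` one position around the ring defined by `comm`.
    const int rank = mpi::Rank( comm );
    const int size = mpi::Size( comm );
    mpi::SendRecv( buf, count, (rank+1)%size, (rank+size-1)%size, comm );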
-template<typename R>
+template < typename R >
-void Broadcast( R* buf, int count, int root, Comm comm )
+void Broadcast (R * buf, int count, int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Broadcast"))
-    SafeMpi( MPI_Bcast( buf, count, TypeMap<R>(), root, comm.comm ) );
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Broadcast"))
+    SafeMpi (MPI_Bcast (buf, count, TypeMap < R > (), root, comm.comm));
 }
 
-template<typename R>
+template < typename R >
-void Broadcast( Complex<R>* buf, int count, int root, Comm comm )
+void Broadcast (Complex < R > *buf, int count, int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Broadcast"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Broadcast"))
 #ifdef EL_AVOID_COMPLEX_MPI
-    SafeMpi( MPI_Bcast( buf, 2*count, TypeMap<R>(), root, comm.comm ) );
+    SafeMpi (MPI_Bcast (buf, 2 * count, TypeMap < R > (), root, comm.comm));
 #else
-    SafeMpi( MPI_Bcast( buf, count, TypeMap<Complex<R>>(), root, comm.comm ) );
+    SafeMpi (MPI_Bcast (buf, count, TypeMap < Complex < R >> (), root, comm.comm));
 #endif
 }
 
-template void Broadcast( byte* buf, int count, int root, Comm comm );
-template void Broadcast( int* buf, int count, int root, Comm comm );
-template void Broadcast( unsigned* buf, int count, int root, Comm comm );
-template void Broadcast( long int* buf, int count, int root, Comm comm );
-template void Broadcast( unsigned long* buf, int count, int root, Comm comm );
+template void Broadcast (byte * buf, int count, int root, Comm comm);
+template void Broadcast (int *buf, int count, int root, Comm comm);
+template void Broadcast (unsigned *buf, int count, int root, Comm comm);
+template void Broadcast (long int *buf, int count, int root, Comm comm);
+template void Broadcast (unsigned long *buf, int count, int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Broadcast( long long int* buf, int count, int root, Comm comm );
-template void Broadcast( unsigned long long* buf, int count, int root, Comm comm );
+template void Broadcast (long long int *buf, int count, int root, Comm comm);
+template void Broadcast (unsigned long long *buf, int count, int root, Comm comm);
 #endif
-template void Broadcast( float* buf, int count, int root, Comm comm );
-template void Broadcast( double* buf, int count, int root, Comm comm );
-template void Broadcast( Complex<float>* buf, int count, int root, Comm comm );
-template void Broadcast( Complex<double>* buf, int count, int root, Comm comm );
+template void Broadcast (float *buf, int count, int root, Comm comm);
+template void Broadcast (double *buf, int count, int root, Comm comm);
+template void Broadcast (Complex < float >*buf, int count, int root, Comm comm);
+template void Broadcast (Complex < double >*buf, int count, int root, Comm comm);
 
-template<typename T>
-void Broadcast( T& b, int root, Comm comm )
-{ Broadcast( &b, 1, root, comm ); }
-
-template void Broadcast( byte& b, int root, Comm comm );
-template void Broadcast( int& b, int root, Comm comm );
-template void Broadcast( unsigned& b, int root, Comm comm );
-template void Broadcast( long int& b, int root, Comm comm );
-template void Broadcast( unsigned long& b, int root, Comm comm );
+template < typename T > void Broadcast (T & b, int root, Comm comm)
+{
+  Broadcast (&b, 1, root, comm);
+}
+
+template void Broadcast (byte & b, int root, Comm comm);
+template void Broadcast (int &b, int root, Comm comm);
+template void Broadcast (unsigned &b, int root, Comm comm);
+template void Broadcast (long int &b, int root, Comm comm);
+template void Broadcast (unsigned long &b, int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Broadcast( long long int& b, int root, Comm comm );
-template void Broadcast( unsigned long long& b, int root, Comm comm );
+template void Broadcast (long long int &b, int root, Comm comm);
+template void Broadcast (unsigned long long &b, int root, Comm comm);
 #endif
-template void Broadcast( float& b, int root, Comm comm );
-template void Broadcast( double& b, int root, Comm comm );
-template void Broadcast( Complex<float>& b, int root, Comm comm );
-template void Broadcast( Complex<double>& b, int root, Comm comm );
+template void Broadcast (float &b, int root, Comm comm);
+template void Broadcast (double &b, int root, Comm comm);
+template void Broadcast (Complex < float >&b, int root, Comm comm);
+template void Broadcast (Complex < double >&b, int root, Comm comm);
 
 #ifdef EL_HAVE_NONBLOCKING_COLLECTIVES
-template<typename R>
+template < typename R >
-void IBroadcast( R* buf, int count, int root, Comm comm, Request& request )
+void IBroadcast (R * buf, int count, int root, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IBroadcast"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IBroadcast"))
     SafeMpi
-    ( MPI_Ibcast( buf, count, TypeMap<R>(), root, comm.comm, &request ) );
+    (MPI_Ibcast (buf, count, TypeMap < R > (), root, comm.comm, &request));
 }
 
-template<typename R>
+template < typename R >
 void IBroadcast
-( Complex<R>* buf, int count, int root, Comm comm, Request& request )
+(Complex < R > *buf, int count, int root, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IBroadcast"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IBroadcast"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Ibcast( buf, 2*count, TypeMap<R>(), root, comm.comm, &request ) );
+    (MPI_Ibcast (buf, 2 * count, TypeMap < R > (), root, comm.comm, &request));
 #else
     SafeMpi
-    ( MPI_Ibcast
-      ( buf, count, TypeMap<Complex<R>>(), root, comm.comm, &request ) );
+    (MPI_Ibcast (buf, count, TypeMap < Complex < R >> (), root, comm.comm, &request));
 #endif
 }
 
-template void IBroadcast( byte* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( int* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( long int* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned long* buf, int count, int root, Comm comm, Request& request );
+template void IBroadcast (byte * buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (int *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (long int *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned long *buf, int count, int root, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void IBroadcast( long long int* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned long long* buf, int count, int root, Comm comm, Request& request );
+template void IBroadcast (long long int *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned long long *buf, int count, int root, Comm comm, Request & request);
 #endif
-template void IBroadcast( float* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( double* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( Complex<float>* buf, int count, int root, Comm comm, Request& request );
-template void IBroadcast( Complex<double>* buf, int count, int root, Comm comm, Request& request );
+template void IBroadcast (float *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (double *buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (Complex < float >*buf, int count, int root, Comm comm, Request & request);
+template void IBroadcast (Complex < double >*buf, int count, int root, Comm comm, Request & request);
 
-template<typename T>
-void IBroadcast( T& b, int root, Comm comm, Request& request )
-{ IBroadcast( &b, 1, root, comm, request ); }
-
-template void IBroadcast( byte& b, int root, Comm comm, Request& request );
-template void IBroadcast( int& b, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned& b, int root, Comm comm, Request& request );
-template void IBroadcast( long int& b, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned long& b, int root, Comm comm, Request& request );
+template < typename T >
+void IBroadcast (T & b, int root, Comm comm, Request & request)
+{
+  IBroadcast (&b, 1, root, comm, request);
+}
+
+template void IBroadcast (byte & b, int root, Comm comm, Request & request);
+template void IBroadcast (int &b, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned &b, int root, Comm comm, Request & request);
+template void IBroadcast (long int &b, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned long &b, int root, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void IBroadcast( long long int& b, int root, Comm comm, Request& request );
-template void IBroadcast( unsigned long long& b, int root, Comm comm, Request& request );
+template void IBroadcast (long long int &b, int root, Comm comm, Request & request);
+template void IBroadcast (unsigned long long &b, int root, Comm comm, Request & request);
 #endif
-template void IBroadcast( float& b, int root, Comm comm, Request& request );
-template void IBroadcast( double& b, int root, Comm comm, Request& request );
-template void IBroadcast( Complex<float>& b, int root, Comm comm, Request& request );
-template void IBroadcast( Complex<double>& b, int root, Comm comm, Request& request );
+template void IBroadcast (float &b, int root, Comm comm, Request & request);
+template void IBroadcast (double &b, int root, Comm comm, Request & request);
+template void IBroadcast (Complex < float >&b, int root, Comm comm, Request & request);
+template void IBroadcast (Complex < double >&b, int root, Comm comm, Request & request);
 #endif // ifdef EL_HAVE_NONBLOCKING_COLLECTIVES
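[Editorial note: usage of the scalar broadcast overloads is correspondingly terse. A hedged sketch of distributing a root-only value; `ReadParameter` is an illustrative helper, not part of the patch:]

    double alpha = 0;
    if( mpi::Rank( comm ) == 0 )
        alpha = ReadParameter(); // hypothetical root-only input
    // Forwards to Broadcast( &alpha, 1, /*root=*/0, comm )
    mpi::Broadcast( alpha, 0, comm );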
-template<typename R>
+template < typename R >
 void Gather
-( const R* sbuf, int sc,
-  R* rbuf, int rc, int root, Comm comm )
+(const R * sbuf, int sc, R * rbuf, int rc, int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Gather"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Gather"))
     SafeMpi
-    ( MPI_Gather
-      ( const_cast<R*>(sbuf), sc, TypeMap<R>(),
-        rbuf, rc, TypeMap<R>(), root, comm.comm ) );
+    (MPI_Gather
+     (const_cast < R * >(sbuf), sc, TypeMap < R > (), rbuf, rc, TypeMap < R > (), root, comm.comm));
 }
 
-template<typename R>
+template < typename R >
 void Gather
-( const Complex<R>* sbuf, int sc,
-  Complex<R>* rbuf, int rc, int root, Comm comm )
+(const Complex < R > *sbuf, int sc, Complex < R > *rbuf, int rc, int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Gather"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::Gather"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Gather
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(),
-        rbuf, 2*rc, TypeMap<R>(), root, comm.comm ) );
+    (MPI_Gather
+     (const_cast < Complex < R > *>(sbuf), 2 * sc, TypeMap < R > (), rbuf, 2 * rc, TypeMap < R > (), root, comm.comm));
 #else
     SafeMpi
-    ( MPI_Gather
-      ( const_cast<Complex<R>*>(sbuf), sc, TypeMap<Complex<R>>(),
-        rbuf, rc, TypeMap<Complex<R>>(),
-        root, comm.comm ) );
+    (MPI_Gather
+     (const_cast < Complex < R > *>(sbuf), sc, TypeMap < Complex < R >> (), rbuf, rc, TypeMap < Complex < R >> (), root, comm.comm));
 #endif
 }
 
-template void Gather( const byte* sbuf, int sc, byte* rbuf, int rc, int root, Comm comm );
-template void Gather( const int* sbuf, int sc, int* rbuf, int rc, int root, Comm comm );
-template void Gather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, int root, Comm comm );
-template void Gather( const long int* sbuf, int sc, long int* rbuf, int rc, int root, Comm comm );
-template void Gather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, int root, Comm comm );
+template void Gather (const byte * sbuf, int sc, byte * rbuf, int rc, int root, Comm comm);
+template void Gather (const int *sbuf, int sc, int *rbuf, int rc, int root, Comm comm);
+template void Gather (const unsigned *sbuf, int sc, unsigned *rbuf, int rc, int root, Comm comm);
+template void Gather (const long int *sbuf, int sc, long int *rbuf, int rc, int root, Comm comm);
+template void Gather (const unsigned long *sbuf, int sc, unsigned long *rbuf, int rc, int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Gather( const long long int* sbuf, int sc, long long int* rbuf, int rc, int root, Comm comm );
-template void Gather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, int root, Comm comm );
+template void Gather (const long long int *sbuf, int sc, long long int *rbuf, int rc, int root, Comm comm);
+template void Gather (const unsigned long long *sbuf, int sc, unsigned long long *rbuf, int rc, int root, Comm comm);
 #endif
-template void Gather( const float* sbuf, int sc, float* rbuf, int rc, int root, Comm comm );
-template void Gather( const double* sbuf, int sc, double* rbuf, int rc, int root, Comm comm );
-template void Gather( const Complex<float>* sbuf, int sc, Complex<float>* rbuf, int rc, int root, Comm comm );
-template void Gather( const Complex<double>* sbuf, int sc, Complex<double>* rbuf, int rc, int root, Comm comm );
+template void Gather (const float *sbuf, int sc, float *rbuf, int rc, int root, Comm comm);
+template void Gather (const double *sbuf, int sc, double *rbuf, int rc, int root, Comm comm);
+template void Gather (const Complex < float >*sbuf, int sc, Complex < float >*rbuf, int rc, int root, Comm comm);
+template void Gather (const Complex < double >*sbuf, int sc, Complex < double >*rbuf, int rc, int root, Comm comm);
 
 #ifdef EL_HAVE_NONBLOCKING_COLLECTIVES
-template<typename R>
+template < typename R >
 void IGather
-( const R* sbuf, int sc,
-  R* rbuf, int rc, int root, Comm comm, Request& request )
+(const R * sbuf, int sc, R * rbuf, int rc, int root, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IGather"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IGather"))
     SafeMpi
-    ( MPI_Igather
-      ( const_cast<R*>(sbuf), sc, TypeMap<R>(),
-        rbuf, rc, TypeMap<R>(), root, comm.comm, &request ) );
+    (MPI_Igather
+     (const_cast < R * >(sbuf), sc, TypeMap < R > (), rbuf, rc, TypeMap < R > (), root, comm.comm, &request));
 }
 
-template<typename R>
+template < typename R >
 void IGather
-( const Complex<R>* sbuf, int sc,
-  Complex<R>* rbuf, int rc, int root, Comm comm, Request& request )
+(const Complex < R > *sbuf, int sc, Complex < R > *rbuf, int rc, int root, Comm comm, Request & request)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::IGather"))
+  DEBUG_ONLY (CallStackEntry cse ("mpi::IGather"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Igather
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(),
-        rbuf, 2*rc, TypeMap<R>(),
-        root, comm.comm, &request ) );
+    (MPI_Igather
+     (const_cast < Complex < R > *>(sbuf), 2 * sc, TypeMap < R > (), rbuf, 2 * rc, TypeMap < R > (), root, comm.comm, &request));
 #else
     SafeMpi
-    ( MPI_Igather
-      ( const_cast<Complex<R>*>(sbuf), sc, TypeMap<Complex<R>>(),
-        rbuf, rc, TypeMap<Complex<R>>(),
-        root, comm.comm, &request ) );
+    (MPI_Igather
+     (const_cast < Complex < R > *>(sbuf), sc, TypeMap < Complex < R >> (), rbuf, rc, TypeMap < Complex < R >> (), root, comm.comm, &request));
 #endif
 }
 
-template void IGather( const byte* sbuf, int sc, byte* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const int* sbuf, int sc, int* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const long int* sbuf, int sc, long int* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, int root, Comm comm, Request& request );
+template void IGather (const byte * sbuf, int sc, byte * rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const int *sbuf, int sc, int *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const unsigned *sbuf, int sc, unsigned *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const long int *sbuf, int sc, long int *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const unsigned long *sbuf, int sc, unsigned long *rbuf, int rc, int root, Comm comm, Request & request);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void IGather( const long long int* sbuf, int sc, long long int* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, int root, Comm comm, Request& request );
+template void IGather (const long long int *sbuf, int sc, long long int *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const unsigned long long *sbuf, int sc, unsigned long long *rbuf, int rc, int root, Comm comm, Request & request);
 #endif
-template void IGather( const float* sbuf, int sc, float* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const double* sbuf, int sc, double* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const Complex<float>* sbuf, int sc, Complex<float>* rbuf, int rc, int root, Comm comm, Request& request );
-template void IGather( const Complex<double>* sbuf, int sc, Complex<double>* rbuf, int rc, int root, Comm comm, Request& request );
+template void IGather (const float *sbuf, int sc, float *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const double *sbuf, int sc, double *rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const Complex < float >*sbuf, int sc, Complex < float >*rbuf, int rc, int root, Comm comm, Request & request);
+template void IGather (const Complex < double >*sbuf, int sc, Complex < double >*rbuf, int rc, int root, Comm comm, Request & request);
 #endif // ifdef EL_HAVE_NONBLOCKING_COLLECTIVES
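[Editorial note: the fixed-count overloads above assume every rank contributes the same number of entries; the variable-count (MPI_Gatherv-based) overloads that follow relax this. A hedged sketch of the fixed-count case; `n` is a hypothetical per-rank count:]

    // Every rank contributes n doubles; rank 0 receives the concatenation.
    std::vector<double> local( n, double(mpi::Rank(comm)) );
    std::vector<double> gathered;
    if( mpi::Rank( comm ) == 0 )
        gathered.resize( std::size_t(n)*mpi::Size( comm ) );
    mpi::Gather( local.data(), n, gathered.data(), n, 0, comm );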
Request& request ); -template void IGather -( const double* sbuf, int sc, - double* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm, Request& request ); +(const float *sbuf, int sc, + float *rbuf, int rc, int root, Comm comm, + Request & request); +template void IGather (const double *sbuf, int sc, + double *rbuf, int rc, int root, + Comm comm, Request & request); +template void IGather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, int root, Comm comm, + Request & request); +template void IGather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, int root, Comm comm, + Request & request); #endif // ifdef EL_HAVE_NONBLOCKING_COLLECTIVES -template +template < typename R > void Gather -( const R* sbuf, int sc, - R* rbuf, const int* rcs, const int* rds, int root, Comm comm ) +(const R * sbuf, int sc, + R * rbuf, const int *rcs, const int *rds, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) SafeMpi - ( MPI_Gatherv - ( const_cast(sbuf), - sc, - TypeMap(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap(), - root, - comm.comm ) ); -} - -template + (MPI_Gatherv + (const_cast < R * >(sbuf), + sc, + TypeMap < R > (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), TypeMap < R > (), + root, comm.comm)); +} + +template < typename R > void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, int root, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, const int *rcs, const int *rds, + int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) #ifdef EL_AVOID_COMPLEX_MPI - const int commRank = Rank( comm ); - const int commSize = Size( comm ); - std::vector rcsDouble, rdsDouble; - if( commRank == root ) + const int commRank = Rank (comm); + const int commSize = Size (comm); + std::vector < int >rcsDouble, rdsDouble; + + if (commRank == root) { - rcsDouble.resize( commSize ); - rdsDouble.resize( commSize ); - for( int i=0; i*>(sbuf), 2*sc, TypeMap(), - rbuf, rcsDouble.data(), rdsDouble.data(), TypeMap(), - root, comm.comm ) ); + (MPI_Gatherv + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, rcsDouble.data (), + rdsDouble.data (), TypeMap < R > (), root, + comm.comm)); #else SafeMpi - ( MPI_Gatherv - ( const_cast*>(sbuf), - sc, - TypeMap>(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap>(), - root, - comm.comm ) ); + (MPI_Gatherv + (const_cast < Complex < R > *>(sbuf), + sc, + TypeMap < Complex < R >> (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), + TypeMap < Complex < R >> (), root, + comm.comm)); #endif } template void Gather -( const byte* sbuf, int sc, - byte* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const int* sbuf, int sc, - int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned* sbuf, int sc, - unsigned* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const long int* sbuf, int sc, - long int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned long* sbuf, int sc, - unsigned 
long* rbuf, const int* rcs, const int* rds, int root, Comm comm ); +(const byte * sbuf, int sc, + byte * rbuf, const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const int *sbuf, int sc, int *rbuf, + const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const unsigned *sbuf, int sc, + unsigned *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const long int *sbuf, int sc, + long int *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const unsigned long *sbuf, int sc, + unsigned long *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void Gather -( const long long int* sbuf, int sc, - long long int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, const int* rcs, const int* rds, int root, Comm comm ); +(const long long int *sbuf, int sc, + long long int *rbuf, const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + const int *rcs, const int *rds, + int root, Comm comm); #endif template void Gather -( const float* sbuf, int sc, - float* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const double* sbuf, int sc, - double* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, - int root, Comm comm ); -template void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, - int root, Comm comm ); - -template +(const float *sbuf, int sc, + float *rbuf, const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const double *sbuf, int sc, + double *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + const int *rcs, const int *rds, + int root, Comm comm); + +template < typename R > void AllGather -( const R* sbuf, int sc, - R* rbuf, int rc, Comm comm ) +(const R * sbuf, int sc, R * rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS SafeMpi - ( MPI_Allgather - ( (UCP)const_cast(sbuf), sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, sizeof(R)*rc, MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgather + ((UCP) const_cast < R * >(sbuf), sizeof (R) * sc, + MPI_UNSIGNED_CHAR, (UCP) rbuf, sizeof (R) * rc, + MPI_UNSIGNED_CHAR, comm.comm)); #else SafeMpi - ( MPI_Allgather - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), comm.comm ) ); + (MPI_Allgather + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), comm.comm)); #endif } -template +template < typename R > void AllGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS SafeMpi - ( MPI_Allgather - ( (UCP)const_cast*>(sbuf), 2*sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, 
2*sizeof(R)*rc, MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgather + ((UCP) const_cast < Complex < R > *>(sbuf), + 2 * sizeof (R) * sc, MPI_UNSIGNED_CHAR, + (UCP) rbuf, 2 * sizeof (R) * rc, + MPI_UNSIGNED_CHAR, comm.comm)); #else - #ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Allgather - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), comm.comm ) ); - #else + (MPI_Allgather + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), comm.comm)); +#else SafeMpi - ( MPI_Allgather - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), comm.comm ) ); - #endif + (MPI_Allgather + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), comm.comm)); +#endif #endif } -template void AllGather( const byte* sbuf, int sc, byte* rbuf, int rc, Comm comm ); -template void AllGather( const int* sbuf, int sc, int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, Comm comm ); -template void AllGather( const long int* sbuf, int sc, long int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, Comm comm ); +template void AllGather (const byte * sbuf, int sc, + byte * rbuf, int rc, Comm comm); +template void AllGather (const int *sbuf, int sc, + int *rbuf, int rc, Comm comm); +template void AllGather (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, + Comm comm); +template void AllGather (const long int *sbuf, int sc, + long int *rbuf, int rc, + Comm comm); +template void AllGather (const unsigned long *sbuf, + int sc, unsigned long *rbuf, + int rc, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllGather( const long long int* sbuf, int sc, long long int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, Comm comm ); +template void AllGather (const long long int *sbuf, + int sc, long long int *rbuf, + int rc, Comm comm); +template void AllGather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + int rc, Comm comm); #endif -template void AllGather( const float* sbuf, int sc, float* rbuf, int rc, Comm comm ); -template void AllGather( const double* sbuf, int sc, double* rbuf, int rc, Comm comm ); -template void AllGather( const Complex* sbuf, int sc, Complex* rbuf, int rc, Comm comm ); -template void AllGather( const Complex* sbuf, int sc, Complex* rbuf, int rc, Comm comm ); - -template +template void AllGather (const float *sbuf, int sc, + float *rbuf, int rc, Comm comm); +template void AllGather (const double *sbuf, int sc, + double *rbuf, int rc, Comm comm); +template void AllGather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, Comm comm); +template void AllGather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, Comm comm); + +template < typename R > void AllGather -( const R* sbuf, int sc, - R* rbuf, const int* rcs, const int* rds, Comm comm ) +(const R * sbuf, int sc, + R * rbuf, const int *rcs, const int *rds, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS - const int commSize = Size( comm ); - std::vector byteRcs( commSize ), byteRds( commSize ); - for( int i=0; ibyteRcs (commSize), + byteRds (commSize); + for (int i = 0; i < commSize; ++i) { - byteRcs[i] = 
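+// EL_USE_BYTE_ALLGATHERS reinterprets the payload as raw bytes
+// (MPI_UNSIGNED_CHAR with sizeof(R)-scaled counts), which is valid for
+// the bitwise-copyable types instantiated in this file and sidesteps
+// datatype handling inside the underlying MPI_Allgather.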
-template void AllGather( const byte* sbuf, int sc, byte* rbuf, int rc, Comm comm );
-template void AllGather( const int* sbuf, int sc, int* rbuf, int rc, Comm comm );
-template void AllGather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, Comm comm );
-template void AllGather( const long int* sbuf, int sc, long int* rbuf, int rc, Comm comm );
-template void AllGather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, Comm comm );
+template void AllGather (const byte * sbuf, int sc,
+                         byte * rbuf, int rc, Comm comm);
+template void AllGather (const int *sbuf, int sc,
+                         int *rbuf, int rc, Comm comm);
+template void AllGather (const unsigned *sbuf, int sc,
+                         unsigned *rbuf, int rc,
+                         Comm comm);
+template void AllGather (const long int *sbuf, int sc,
+                         long int *rbuf, int rc,
+                         Comm comm);
+template void AllGather (const unsigned long *sbuf,
+                         int sc, unsigned long *rbuf,
+                         int rc, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void AllGather( const long long int* sbuf, int sc, long long int* rbuf, int rc, Comm comm );
-template void AllGather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, Comm comm );
+template void AllGather (const long long int *sbuf,
+                         int sc, long long int *rbuf,
+                         int rc, Comm comm);
+template void AllGather (const unsigned long long *sbuf,
+                         int sc, unsigned long long *rbuf,
+                         int rc, Comm comm);
 #endif
-template void AllGather( const float* sbuf, int sc, float* rbuf, int rc, Comm comm );
-template void AllGather( const double* sbuf, int sc, double* rbuf, int rc, Comm comm );
-template void AllGather( const Complex<float>* sbuf, int sc, Complex<float>* rbuf, int rc, Comm comm );
-template void AllGather( const Complex<double>* sbuf, int sc, Complex<double>* rbuf, int rc, Comm comm );
+template void AllGather (const float *sbuf, int sc,
+                         float *rbuf, int rc, Comm comm);
+template void AllGather (const double *sbuf, int sc,
+                         double *rbuf, int rc, Comm comm);
+template void AllGather (const Complex < float >*sbuf,
+                         int sc, Complex < float >*rbuf,
+                         int rc, Comm comm);
+template void AllGather (const Complex < double >*sbuf,
+                         int sc, Complex < double >*rbuf,
+                         int rc, Comm comm);

-template<typename R>
+template < typename R >
 void AllGather
-( const R* sbuf, int sc,
-  R* rbuf, const int* rcs, const int* rds, Comm comm )
+(const R * sbuf, int sc,
+ R * rbuf, const int *rcs, const int *rds, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllGather"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather"))
 #ifdef EL_USE_BYTE_ALLGATHERS
-    const int commSize = Size( comm );
-    std::vector<int> byteRcs( commSize ), byteRds( commSize );
-    for( int i=0; i<commSize; ++i )
+    const int commSize = Size (comm);
+    std::vector < int >byteRcs (commSize),
+          byteRds (commSize);
+    for (int i = 0; i < commSize; ++i)
     {
-        byteRcs[i] = sizeof(R)*rcs[i];
-        byteRds[i] = sizeof(R)*rds[i];
+        byteRcs[i] = sizeof (R) * rcs[i];
+        byteRds[i] = sizeof (R) * rds[i];
     }
     SafeMpi
-    ( MPI_Allgatherv
-      ( (UCP)const_cast<R*>(sbuf), sizeof(R)*sc, MPI_UNSIGNED_CHAR,
-        (UCP)rbuf, byteRcs.data(), byteRds.data(), MPI_UNSIGNED_CHAR,
-        comm.comm ) );
+        (MPI_Allgatherv
+         ((UCP) const_cast < R * >(sbuf), sizeof (R) * sc,
+          MPI_UNSIGNED_CHAR, (UCP) rbuf, byteRcs.data (),
+          byteRds.data (), MPI_UNSIGNED_CHAR, comm.comm));
 #else
     SafeMpi
-    ( MPI_Allgatherv
-      ( const_cast<R*>(sbuf),
-        sc,
-        TypeMap<R>(),
-        rbuf,
-        const_cast<int*>(rcs),
-        const_cast<int*>(rds),
-        TypeMap<R>(),
-        comm.comm ) );
+        (MPI_Allgatherv
+         (const_cast < R * >(sbuf),
+          sc,
+          TypeMap < R > (),
+          rbuf,
+          const_cast < int *>(rcs),
+          const_cast < int *>(rds), TypeMap < R > (),
+          comm.comm));
 #endif
 }

-template<typename R>
+template < typename R >
 void AllGather
-( const Complex<R>* sbuf, int sc,
-  Complex<R>* rbuf, const int* rcs, const int* rds, Comm comm )
+(const Complex < R > *sbuf, int sc,
+ Complex < R > *rbuf, const int *rcs, const int *rds,
+ Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllGather"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather"))
 #ifdef EL_USE_BYTE_ALLGATHERS
-    const int commSize = Size( comm );
-    std::vector<int> byteRcs( commSize ), byteRds( commSize );
-    for( int i=0; i<commSize; ++i )
+    const int commSize = Size (comm);
+    std::vector < int >byteRcs (commSize),
+          byteRds (commSize);
+    for (int i = 0; i < commSize; ++i)
     {
-        byteRcs[i] = 2*sizeof(R)*rcs[i];
-        byteRds[i] = 2*sizeof(R)*rds[i];
+        byteRcs[i] = 2 * sizeof (R) * rcs[i];
+        byteRds[i] = 2 * sizeof (R) * rds[i];
     }
     SafeMpi
-    ( MPI_Allgatherv
-      ( (UCP)const_cast<Complex<R>*>(sbuf), 2*sizeof(R)*sc, MPI_UNSIGNED_CHAR,
-        (UCP)rbuf, byteRcs.data(), byteRds.data(), MPI_UNSIGNED_CHAR,
-        comm.comm ) );
+        (MPI_Allgatherv
+         ((UCP) const_cast < Complex < R > *>(sbuf),
+          2 * sizeof (R) * sc, MPI_UNSIGNED_CHAR,
+          (UCP) rbuf, byteRcs.data (), byteRds.data (),
+          MPI_UNSIGNED_CHAR, comm.comm));
 #else
- #ifdef EL_AVOID_COMPLEX_MPI
-    const int commSize = Size( comm );
-    std::vector<int> realRcs( commSize ), realRds( commSize );
-    for( int i=0; i<commSize; ++i )
+#ifdef EL_AVOID_COMPLEX_MPI
+    const int commSize = Size (comm);
+    std::vector < int >realRcs (commSize),
+          realRds (commSize);
+    for (int i = 0; i < commSize; ++i)
     {
-        realRcs[i] = 2*rcs[i];
-        realRds[i] = 2*rds[i];
+        realRcs[i] = 2 * rcs[i];
+        realRds[i] = 2 * rds[i];
     }
     SafeMpi
-    ( MPI_Allgatherv
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(),
-        rbuf, realRcs.data(), realRds.data(), TypeMap<R>(), comm.comm ) );
- #else
+        (MPI_Allgatherv
+         (const_cast < Complex < R > *>(sbuf), 2 * sc,
+          TypeMap < R > (), rbuf, realRcs.data (),
+          realRds.data (), TypeMap < R > (), comm.comm));
+#else
     SafeMpi
-    ( MPI_Allgatherv
-      ( const_cast<Complex<R>*>(sbuf),
-        sc,
-        TypeMap<Complex<R>>(),
-        rbuf,
-        const_cast<int*>(rcs),
-        const_cast<int*>(rds),
-        TypeMap<Complex<R>>(),
-        comm.comm ) );
- #endif
+        (MPI_Allgatherv
+         (const_cast < Complex < R > *>(sbuf),
+          sc,
+          TypeMap < Complex < R >> (),
+          rbuf,
+          const_cast < int *>(rcs),
+          const_cast < int *>(rds),
+          TypeMap < Complex < R >> (), comm.comm));
+#endif
 #endif
 }

 template void AllGather
-( const byte* sbuf, int sc,
-  byte* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const int* sbuf, int sc,
-  int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const unsigned* sbuf, int sc,
-  unsigned* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const long int* sbuf, int sc,
-  long int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const unsigned long* sbuf, int sc,
-  unsigned long* rbuf, const int* rcs, const int* rds, Comm comm );
+(const byte * sbuf, int sc,
+ byte * rbuf, const int *rcs, const int *rds,
+ Comm comm);
+template void AllGather (const int *sbuf, int sc,
+                         int *rbuf, const int *rcs,
+                         const int *rds, Comm comm);
+template void AllGather (const unsigned *sbuf, int sc,
+                         unsigned *rbuf, const int *rcs,
+                         const int *rds, Comm comm);
+template void AllGather (const long int *sbuf, int sc,
+                         long int *rbuf, const int *rcs,
+                         const int *rds, Comm comm);
+template void AllGather (const unsigned long *sbuf,
+                         int sc, unsigned long *rbuf,
+                         const int *rcs, const int *rds,
+                         Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
 template void AllGather
-( const long long int* sbuf, int sc,
-  long long int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const unsigned long long* sbuf, int sc,
-  unsigned long long* rbuf, const int* rcs, const int* rds, Comm comm );
+(const long long int *sbuf, int sc,
+ long long int *rbuf, const int *rcs, const int *rds,
+ Comm comm);
+template void AllGather (const unsigned long long *sbuf,
+                         int sc, unsigned long long *rbuf,
+                         const int *rcs, const int *rds,
+                         Comm comm);
 #endif
 template void AllGather
-( const float* sbuf, int sc,
-  float* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const double* sbuf, int sc,
-  double* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const Complex<float>* sbuf, int sc,
-  Complex<float>* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllGather
-( const Complex<double>* sbuf, int sc,
-  Complex<double>* rbuf, const int* rcs, const int* rds, Comm comm );
+(const float *sbuf, int sc,
+ float *rbuf, const int *rcs, const int *rds,
+ Comm comm);
+template void AllGather (const double *sbuf, int sc,
+                         double *rbuf, const int *rcs,
+                         const int *rds, Comm comm);
+template void AllGather (const Complex < float >*sbuf,
+                         int sc, Complex < float >*rbuf,
+                         const int *rcs, const int *rds,
+                         Comm comm);
+template void AllGather (const Complex < double >*sbuf,
+                         int sc, Complex < double >*rbuf,
+                         const int *rcs, const int *rds,
+                         Comm comm);

-template<typename R>
+template < typename R >
 void Scatter
-( const R* sbuf, int sc,
-  R* rbuf, int rc, int root, Comm comm )
+(const R * sbuf, int sc, R * rbuf, int rc, int root,
+ Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Scatter"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter"))
     SafeMpi
-    ( MPI_Scatter
-      ( const_cast<R*>(sbuf), sc, TypeMap<R>(),
-        rbuf, rc, TypeMap<R>(), root, comm.comm ) );
+        (MPI_Scatter
+         (const_cast < R * >(sbuf), sc, TypeMap < R > (),
+          rbuf, rc, TypeMap < R > (), root, comm.comm));
 }

-template<typename R>
+template < typename R >
 void Scatter
-( const Complex<R>* sbuf, int sc,
-  Complex<R>* rbuf, int rc, int root, Comm comm )
+(const Complex < R > *sbuf, int sc,
+ Complex < R > *rbuf, int rc, int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Scatter"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Scatter
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(),
-        rbuf, 2*rc, TypeMap<R>(), root, comm.comm ) );
+        (MPI_Scatter
+         (const_cast < Complex < R > *>(sbuf), 2 * sc,
+          TypeMap < R > (), rbuf, 2 * rc,
+          TypeMap < R > (), root, comm.comm));
 #else
     SafeMpi
-    ( MPI_Scatter
-      ( const_cast<Complex<R>*>(sbuf), sc, TypeMap<Complex<R>>(),
-        rbuf, rc, TypeMap<Complex<R>>(),
-        root, comm.comm ) );
+        (MPI_Scatter
+         (const_cast < Complex < R > *>(sbuf), sc,
+          TypeMap < Complex < R >> (), rbuf, rc,
+          TypeMap < Complex < R >> (), root, comm.comm));
 #endif
 }

 template void Scatter
-( const byte* sbuf, int sc,
-  byte* rbuf, int rc, int root, Comm comm );
-template void Scatter
-( const int* sbuf, int sc,
-  int* rbuf, int rc, int root, Comm comm );
+(const byte * sbuf, int sc,
+ byte * rbuf, int rc, int root, Comm comm);
 template void Scatter
-( const unsigned* sbuf, int sc,
-  unsigned* rbuf, int rc, int root, Comm comm );
-template void Scatter
-( const long int* sbuf, int sc,
-  long int* rbuf, int rc, int root, Comm comm );
-template void Scatter
-( const unsigned long* sbuf, int sc,
-  unsigned long* rbuf, int rc, int root, Comm comm );
+(const int *sbuf, int sc, int *rbuf, int rc, int root,
+ Comm comm);
+template void Scatter (const unsigned *sbuf, int sc,
+                       unsigned *rbuf, int rc, int root,
+                       Comm comm);
+template void Scatter (const long int *sbuf, int sc,
+                       long int *rbuf, int rc, int root,
+                       Comm comm);
+template void Scatter (const unsigned long *sbuf, int sc,
+                       unsigned long *rbuf, int rc,
+                       int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
 template void Scatter
-( const long long int* sbuf, int sc,
-  long long int* rbuf, int rc, int root, Comm comm );
+(const long long int *sbuf, int sc,
+ long long int *rbuf, int rc, int root, Comm comm);
 template void Scatter
-( const unsigned long long* sbuf, int sc,
-  unsigned long long* rbuf, int rc, int root, Comm comm );
+(const unsigned long long *sbuf, int sc,
+ unsigned long long *rbuf, int rc, int root,
+ Comm comm);
 #endif
 template void Scatter
-( const float* sbuf, int sc,
-  float* rbuf, int rc, int root, Comm comm );
+(const float *sbuf, int sc,
+ float *rbuf, int rc, int root, Comm comm);
 template void Scatter
-( const double* sbuf, int sc,
-  double* rbuf, int rc, int root, Comm comm );
+(const double *sbuf, int sc,
+ double *rbuf, int rc, int root, Comm comm);
 template void Scatter
-( const Complex<float>* sbuf, int sc,
-  Complex<float>* rbuf, int rc, int root, Comm comm );
+(const Complex < float >*sbuf, int sc,
+ Complex < float >*rbuf, int rc, int root, Comm comm);
 template void Scatter
-( const Complex<double>* sbuf, int sc,
-  Complex<double>* rbuf, int rc, int root, Comm comm );
+(const Complex < double >*sbuf, int sc,
+ Complex < double >*rbuf, int rc, int root,
+ Comm comm);

-template<typename R>
-void Scatter( R* buf, int sc, int rc, int root, Comm comm )
+template < typename R >
+void Scatter (R * buf, int sc, int rc, int root,
+              Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Scatter"))
-    const int commRank = Rank( comm );
-    if( commRank == root )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter"))
+    const int commRank = Rank (comm);
+
+    if (commRank == root)
     {
 #ifdef EL_HAVE_MPI_IN_PLACE
         SafeMpi
-        ( MPI_Scatter
-          ( buf, sc, TypeMap<R>(),
-            MPI_IN_PLACE, rc, TypeMap<R>(), root, comm.comm ) );
+            (MPI_Scatter
+             (buf, sc, TypeMap < R > (),
+              MPI_IN_PLACE, rc, TypeMap < R > (), root,
+              comm.comm));
 #else
-        const int commSize = Size( comm );
-        std::vector<R> sendBuf( sc*commSize );
-        MemCopy( sendBuf.data(), buf, sc*commSize );
+        const int commSize = Size (comm);
+
+        std::vector < R > sendBuf (sc * commSize);
+        MemCopy (sendBuf.data (), buf, sc * commSize);
         SafeMpi
-        ( MPI_Scatter
-          ( sendBuf.data(), sc, TypeMap<R>(),
-            buf, rc, TypeMap<R>(), root, comm.comm ) );
+            (MPI_Scatter
+             (sendBuf.data (), sc, TypeMap < R > (),
+              buf, rc, TypeMap < R > (), root,
+              comm.comm));
 #endif
     }
     else
     {
         SafeMpi
-        ( MPI_Scatter
-          ( 0, sc, TypeMap<R>(),
-            buf, rc, TypeMap<R>(), root, comm.comm ) );
+            (MPI_Scatter
+             (0, sc, TypeMap < R > (),
+              buf, rc, TypeMap < R > (), root,
+              comm.comm));
     }
 }

-template<typename R>
-void Scatter( Complex<R>* buf, int sc, int rc, int root, Comm comm )
+template < typename R >
+void Scatter (Complex < R > *buf, int sc, int rc,
+              int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Scatter"))
-    const int commRank = Rank( comm );
-    if( commRank == root )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter"))
+    const int commRank = Rank (comm);
+
+    if (commRank == root)
     {
 #ifdef EL_AVOID_COMPLEX_MPI
-# ifdef EL_HAVE_MPI_IN_PLACE
+#ifdef EL_HAVE_MPI_IN_PLACE
         SafeMpi
-        ( MPI_Scatter
-          ( buf, 2*sc, TypeMap<R>(),
-            MPI_IN_PLACE, 2*rc, TypeMap<R>(), root, comm.comm ) );
-# else
-        const int commSize = Size( comm );
-        std::vector<Complex<R>> sendBuf( sc*commSize );
-        MemCopy( sendBuf.data(), buf, sc*commSize );
+            (MPI_Scatter
+             (buf, 2 * sc, TypeMap < R > (),
+              MPI_IN_PLACE, 2 * rc, TypeMap < R > (),
+              root, comm.comm));
+#else
+        const int commSize = Size (comm);
+
+        std::vector < Complex <
+              R >> sendBuf (sc * commSize);
+        MemCopy (sendBuf.data (), buf, sc * commSize);
         SafeMpi
-        ( MPI_Scatter
-          ( sendBuf.data(), 2*sc, TypeMap<R>(),
-            buf, 2*rc, TypeMap<R>(), root, comm.comm ) );
-# endif
+            (MPI_Scatter
+             (sendBuf.data (), 2 * sc, TypeMap < R > (),
+              buf, 2 * rc, TypeMap < R > (), root,
+              comm.comm));
+#endif
 #else
-# ifdef EL_HAVE_MPI_IN_PLACE
+#ifdef EL_HAVE_MPI_IN_PLACE
         SafeMpi
-        ( MPI_Scatter
-          ( buf, sc, TypeMap<Complex<R>>(),
-            MPI_IN_PLACE, rc, TypeMap<Complex<R>>(), root, comm.comm ) );
-# else
-        const int commSize = Size( comm );
-        std::vector<Complex<R>> sendBuf( sc*commSize );
-        MemCopy( sendBuf.data(), buf, sc*commSize );
+            (MPI_Scatter
+             (buf, sc, TypeMap < Complex < R >> (),
+              MPI_IN_PLACE, rc,
+              TypeMap < Complex < R >> (), root,
+              comm.comm));
+#else
+        const int commSize = Size (comm);
+
+        std::vector < Complex <
+              R >> sendBuf (sc * commSize);
+        MemCopy (sendBuf.data (), buf, sc * commSize);
         SafeMpi
-        ( MPI_Scatter
-          ( sendBuf.data(), sc, TypeMap<Complex<R>>(),
-            buf, rc, TypeMap<Complex<R>>(), root, comm.comm ) );
-# endif
+            (MPI_Scatter
+             (sendBuf.data (), sc,
+              TypeMap < Complex < R >> (), buf, rc,
+              TypeMap < Complex < R >> (), root,
+              comm.comm));
+#endif
 #endif
     }
     else
     {
 #ifdef EL_AVOID_COMPLEX_MPI
         SafeMpi
-        ( MPI_Scatter
-          ( 0, 2*sc, TypeMap<R>(),
-            buf, 2*rc, TypeMap<R>(), root, comm.comm ) );
+            (MPI_Scatter
+             (0, 2 * sc, TypeMap < R > (),
+              buf, 2 * rc, TypeMap < R > (), root,
+              comm.comm));
 #else
         SafeMpi
-        ( MPI_Scatter
-          ( 0, sc, TypeMap<Complex<R>>(),
-            buf, rc, TypeMap<Complex<R>>(), root, comm.comm ) );
+            (MPI_Scatter
+             (0, sc, TypeMap < Complex < R >> (),
+              buf, rc, TypeMap < Complex < R >> (),
+              root, comm.comm));
 #endif
     }
 }
-template void Scatter( byte* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( int* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( unsigned* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( long int* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( unsigned long* buf, int sc, int rc, int root, Comm comm );
+template void Scatter (byte * buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (int *buf, int sc, int rc, int root,
+                       Comm comm);
+template void Scatter (unsigned *buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (long int *buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (unsigned long *buf, int sc, int rc,
+                       int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Scatter( long long int* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( unsigned long long* buf, int sc, int rc, int root, Comm comm );
+template void Scatter (long long int *buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (unsigned long long *buf, int sc,
+                       int rc, int root, Comm comm);
 #endif
-template void Scatter( float* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( double* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( Complex<float>* buf, int sc, int rc, int root, Comm comm );
-template void Scatter( Complex<double>* buf, int sc, int rc, int root, Comm comm );
+template void Scatter (float *buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (double *buf, int sc, int rc,
+                       int root, Comm comm);
+template void Scatter (Complex < float >*buf, int sc,
+                       int rc, int root, Comm comm);
+template void Scatter (Complex < double >*buf, int sc,
+                       int rc, int root, Comm comm);

-template<typename R>
+template < typename R >
 void AllToAll
-( const R* sbuf, int sc,
-  R* rbuf, int rc, Comm comm )
+(const R * sbuf, int sc, R * rbuf, int rc, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll"))
     SafeMpi
-    ( MPI_Alltoall
-      ( const_cast<R*>(sbuf), sc, TypeMap<R>(),
-        rbuf, rc, TypeMap<R>(), comm.comm ) );
+        (MPI_Alltoall
+         (const_cast < R * >(sbuf), sc, TypeMap < R > (),
+          rbuf, rc, TypeMap < R > (), comm.comm));
 }

-template<typename R>
+template < typename R >
 void AllToAll
-( const Complex<R>* sbuf, int sc,
-  Complex<R>* rbuf, int rc, Comm comm )
+(const Complex < R > *sbuf, int sc,
+ Complex < R > *rbuf, int rc, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll"))
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi
-    ( MPI_Alltoall
-      ( const_cast<Complex<R>*>(sbuf), 2*sc, TypeMap<R>(),
-        rbuf, 2*rc, TypeMap<R>(), comm.comm ) );
+        (MPI_Alltoall
+         (const_cast < Complex < R > *>(sbuf), 2 * sc,
+          TypeMap < R > (), rbuf, 2 * rc,
+          TypeMap < R > (), comm.comm));
 #else
     SafeMpi
-    ( MPI_Alltoall
-      ( const_cast<Complex<R>*>(sbuf), sc, TypeMap<Complex<R>>(),
-        rbuf, rc, TypeMap<Complex<R>>(), comm.comm ) );
+        (MPI_Alltoall
+         (const_cast < Complex < R > *>(sbuf), sc,
+          TypeMap < Complex < R >> (), rbuf, rc,
+          TypeMap < Complex < R >> (), comm.comm));
 #endif
 }

 template void AllToAll
-( const byte* sbuf, int sc,
-  byte* rbuf, int rc, Comm comm );
-template void AllToAll
-( const int* sbuf, int sc,
-  int* rbuf, int rc, Comm comm );
-template void AllToAll
-( const unsigned* sbuf, int sc,
-  unsigned* rbuf, int rc, Comm comm );
-template void AllToAll
-( const long int* sbuf, int sc,
-  long int* rbuf, int rc, Comm comm );
-template void AllToAll
-( const unsigned long* sbuf, int sc,
-  unsigned long* rbuf, int rc, Comm comm );
+(const byte * sbuf, int sc, byte * rbuf, int rc,
+ Comm comm);
+template void AllToAll (const int *sbuf, int sc,
+                        int *rbuf, int rc, Comm comm);
+template void AllToAll (const unsigned *sbuf, int sc,
+                        unsigned *rbuf, int rc,
+                        Comm comm);
+template void AllToAll (const long int *sbuf, int sc,
+                        long int *rbuf, int rc,
+                        Comm comm);
+template void AllToAll (const unsigned long *sbuf, int sc,
+                        unsigned long *rbuf, int rc,
+                        Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
 template void AllToAll
-( const long long int* sbuf, int sc,
-  long long int* rbuf, int rc, Comm comm );
+(const long long int *sbuf, int sc,
+ long long int *rbuf, int rc, Comm comm);
 template void AllToAll
-( const unsigned long long* sbuf, int sc,
-  unsigned long long* rbuf, int rc, Comm comm );
+(const unsigned long long *sbuf, int sc,
+ unsigned long long *rbuf, int rc, Comm comm);
 #endif
 template void AllToAll
-( const float* sbuf, int sc,
-  float* rbuf, int rc, Comm comm );
-template void AllToAll
-( const double* sbuf, int sc,
-  double* rbuf, int rc, Comm comm );
-template void AllToAll
-( const Complex<float>* sbuf, int sc,
-  Complex<float>* rbuf, int rc, Comm comm );
-template void AllToAll
-( const Complex<double>* sbuf, int sc,
-  Complex<double>* rbuf, int rc, Comm comm );
+(const float *sbuf, int sc, float *rbuf, int rc,
+ Comm comm);
+template void AllToAll (const double *sbuf, int sc,
+                        double *rbuf, int rc, Comm comm);
+template void AllToAll (const Complex < float >*sbuf,
+                        int sc, Complex < float >*rbuf,
+                        int rc, Comm comm);
+template void AllToAll (const Complex < double >*sbuf,
+                        int sc, Complex < double >*rbuf,
+                        int rc, Comm comm);

-template<typename R>
+template < typename R >
 void AllToAll
-( const R* sbuf, const int* scs, const int* sds,
-  R* rbuf, const int* rcs, const int* rds, Comm comm )
+(const R * sbuf, const int *scs, const int *sds,
+ R * rbuf, const int *rcs, const int *rds, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll"))
     SafeMpi
-    ( MPI_Alltoallv
-      ( const_cast<R*>(sbuf),
-        const_cast<int*>(scs),
-        const_cast<int*>(sds),
-        TypeMap<R>(),
-        rbuf,
-        const_cast<int*>(rcs),
-        const_cast<int*>(rds),
-        TypeMap<R>(),
-        comm.comm ) );
+        (MPI_Alltoallv
+         (const_cast < R * >(sbuf),
+          const_cast < int *>(scs),
+          const_cast < int *>(sds),
+          TypeMap < R > (),
+          rbuf,
+          const_cast < int *>(rcs),
+          const_cast < int *>(rds), TypeMap < R > (),
+          comm.comm));
 }

-template<typename R>
+template < typename R >
 void AllToAll
-( const Complex<R>* sbuf, const int* scs, const int* sds,
-  Complex<R>* rbuf, const int* rcs, const int* rds, Comm comm )
+(const Complex < R > *sbuf, const int *scs,
+ const int *sds, Complex < R > *rbuf, const int *rcs,
+ const int *rds, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll"))
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll"))
 #ifdef EL_AVOID_COMPLEX_MPI
     int p;
-    MPI_Comm_size( comm.comm, &p );
-    std::vector<int> scsDoubled(p);
-    std::vector<int> sdsDoubled(p);
-    std::vector<int> rcsDoubled(p);
-    std::vector<int> rdsDoubled(p);
-    for( int i=0; i<p; ++i )
-        scsDoubled[i] = 2*scs[i];
-    for( int i=0; i<p; ++i )
-        sdsDoubled[i] = 2*sds[i];
-    for( int i=0; i<p; ++i )
-        rcsDoubled[i] = 2*rcs[i];
-    for( int i=0; i<p; ++i )
-        rdsDoubled[i] = 2*rds[i];
+    MPI_Comm_size (comm.comm, &p);
+    std::vector < int >scsDoubled (p);
+    std::vector < int >sdsDoubled (p);
+    std::vector < int >rcsDoubled (p);
+    std::vector < int >rdsDoubled (p);
+
+    for (int i = 0; i < p; ++i)
+        scsDoubled[i] = 2 * scs[i];
+    for (int i = 0; i < p; ++i)
+        sdsDoubled[i] = 2 * sds[i];
+    for (int i = 0; i < p; ++i)
+        rcsDoubled[i] = 2 * rcs[i];
+    for (int i = 0; i < p; ++i)
+        rdsDoubled[i] = 2 * rds[i];
     SafeMpi
-    ( MPI_Alltoallv
-      ( const_cast<Complex<R>*>(sbuf),
-        scsDoubled.data(), sdsDoubled.data(), TypeMap<R>(),
-        rbuf, rcsDoubled.data(), rdsDoubled.data(), TypeMap<R>(), comm.comm ) );
+        (MPI_Alltoallv
+         (const_cast < Complex < R > *>(sbuf),
+          scsDoubled.data (), sdsDoubled.data (),
+          TypeMap < R > (), rbuf, rcsDoubled.data (),
+          rdsDoubled.data (), TypeMap < R > (),
+          comm.comm));
 #else
     SafeMpi
-    ( MPI_Alltoallv
-      ( const_cast<Complex<R>*>(sbuf),
-        const_cast<int*>(scs),
-        const_cast<int*>(sds),
-        TypeMap<Complex<R>>(),
-        rbuf,
-        const_cast<int*>(rcs),
-        const_cast<int*>(rds),
-        TypeMap<Complex<R>>(),
-        comm.comm ) );
+        (MPI_Alltoallv
+         (const_cast < Complex < R > *>(sbuf),
+          const_cast < int *>(scs),
+          const_cast < int *>(sds),
+          TypeMap < Complex < R >> (),
+          rbuf,
+          const_cast < int *>(rcs),
+          const_cast < int *>(rds),
+          TypeMap < Complex < R >> (), comm.comm));
 #endif
 }
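+// In the variable-count complex case, the per-rank counts and the
+// displacements must both be doubled before calling MPI_Alltoallv with
+// the real datatype, since MPI measures displacements in datatype units.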
 template void AllToAll
-( const byte* sbuf, const int* scs, const int* sds,
-  byte* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const int* sbuf, const int* scs, const int* sds,
-  int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const unsigned* sbuf, const int* scs, const int* sds,
-  unsigned* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const long int* sbuf, const int* scs, const int* sds,
-  long int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const unsigned long* sbuf, const int* scs, const int* sds,
-  unsigned long* rbuf, const int* rcs, const int* rds, Comm comm );
+(const byte * sbuf, const int *scs, const int *sds,
+ byte * rbuf, const int *rcs, const int *rds,
+ Comm comm);
+template void AllToAll (const int *sbuf, const int *scs,
+                        const int *sds, int *rbuf,
+                        const int *rcs, const int *rds,
+                        Comm comm);
+template void AllToAll (const unsigned *sbuf,
+                        const int *scs, const int *sds,
+                        unsigned *rbuf, const int *rcs,
+                        const int *rds, Comm comm);
+template void AllToAll (const long int *sbuf,
+                        const int *scs, const int *sds,
+                        long int *rbuf, const int *rcs,
+                        const int *rds, Comm comm);
+template void AllToAll (const unsigned long *sbuf,
+                        const int *scs, const int *sds,
+                        unsigned long *rbuf,
+                        const int *rcs, const int *rds,
+                        Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
 template void AllToAll
-( const long long int* sbuf, const int* scs, const int* sds,
-  long long int* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const unsigned long long* sbuf, const int* scs, const int* sds,
-  unsigned long long* rbuf, const int* rcs, const int* rds, Comm comm );
+(const long long int *sbuf, const int *scs,
+ const int *sds, long long int *rbuf, const int *rcs,
+ const int *rds, Comm comm);
+template void AllToAll (const unsigned long long *sbuf,
+                        const int *scs, const int *sds,
+                        unsigned long long *rbuf,
+                        const int *rcs, const int *rds,
+                        Comm comm);
 #endif
 template void AllToAll
-( const float* sbuf, const int* scs, const int* sds,
-  float* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const double* sbuf, const int* scs, const int* sds,
-  double* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const Complex<float>* sbuf, const int* scs, const int* sds,
-  Complex<float>* rbuf, const int* rcs, const int* rds, Comm comm );
-template void AllToAll
-( const Complex<double>* sbuf, const int* scs, const int* sds,
-  Complex<double>* rbuf, const int* rcs, const int* rds, Comm comm );
+(const float *sbuf, const int *scs, const int *sds,
+ float *rbuf, const int *rcs, const int *rds,
+ Comm comm);
+template void AllToAll (const double *sbuf,
+                        const int *scs, const int *sds,
+                        double *rbuf, const int *rcs,
+                        const int *rds, Comm comm);
+template void AllToAll (const Complex < float >*sbuf,
+                        const int *scs, const int *sds,
+                        Complex < float >*rbuf,
+                        const int *rcs, const int *rds,
+                        Comm comm);
+template void AllToAll (const Complex < double >*sbuf,
+                        const int *scs, const int *sds,
+                        Complex < double >*rbuf,
+                        const int *rcs, const int *rds,
+                        Comm comm);

-template<typename T>
+template < typename T >
 void Reduce
-( const T* sbuf, T* rbuf, int count, Op op, int root, Comm comm )
+(const T * sbuf, T * rbuf, int count, Op op, int root,
+ Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Reduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce"))
+    if (count != 0)
     {
-        SafeMpi
-        ( MPI_Reduce
-          ( const_cast<T*>(sbuf), rbuf, count, TypeMap<T>(),
-            op.op, root, comm.comm ) );
+        SafeMpi (MPI_Reduce
+                 (const_cast < T * >(sbuf), rbuf, count,
+                  TypeMap < T > (), op.op, root,
+                  comm.comm));
     }
 }

-template<typename R>
+template < typename R >
 void Reduce
-( const Complex<R>* sbuf,
-  Complex<R>* rbuf, int count, Op op, int root, Comm comm )
+(const Complex < R > *sbuf,
+ Complex < R > *rbuf, int count, Op op, int root,
+ Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Reduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce"))
+    if (count != 0)
     {
 #ifdef EL_AVOID_COMPLEX_MPI
-        if( op == SUM )
+        if (op == SUM)
         {
             SafeMpi
-            ( MPI_Reduce
-              ( const_cast<Complex<R>*>(sbuf),
-                rbuf, 2*count, TypeMap<R>(), op.op, root, comm.comm ) );
+                (MPI_Reduce
+                 (const_cast < Complex < R > *>(sbuf),
+                  rbuf, 2 * count, TypeMap < R > (),
+                  op.op, root, comm.comm));
         }
         else
         {
             SafeMpi
-            ( MPI_Reduce
-              ( const_cast<Complex<R>*>(sbuf),
-                rbuf, count, TypeMap<Complex<R>>(), op.op, root, comm.comm ) );
+                (MPI_Reduce
+                 (const_cast < Complex < R > *>(sbuf),
+                  rbuf, count,
+                  TypeMap < Complex < R >> (), op.op,
+                  root, comm.comm));
         }
 #else
         SafeMpi
-        ( MPI_Reduce
-          ( const_cast<Complex<R>*>(sbuf),
-            rbuf, count, TypeMap<Complex<R>>(), op.op, root, comm.comm ) );
+            (MPI_Reduce
+             (const_cast < Complex < R > *>(sbuf),
+              rbuf, count, TypeMap < Complex < R >> (),
+              op.op, root, comm.comm));
 #endif
     }
 }
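+// The op == SUM special case is safe because summing count complex values
+// elementwise is identical to summing 2*count interleaved reals; other
+// reductions are not componentwise over the reals, so they fall back to
+// the complex datatype even when EL_AVOID_COMPLEX_MPI is defined.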
-template void Reduce( const byte* sbuf, byte* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const int* sbuf, int* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const unsigned* sbuf, unsigned* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const long int* sbuf, long int* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Op op, int root, Comm comm );
+template void Reduce (const byte * sbuf, byte * rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const int *sbuf, int *rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const unsigned *sbuf,
+                      unsigned *rbuf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (const long int *sbuf,
+                      long int *rbuf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (const unsigned long *sbuf,
+                      unsigned long *rbuf, int count,
+                      Op op, int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Reduce( const long long int* sbuf, long long int* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, Op op, int root, Comm comm );
+template void Reduce (const long long int *sbuf,
+                      long long int *rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const unsigned long long *sbuf,
+                      unsigned long long *rbuf, int count,
+                      Op op, int root, Comm comm);
 #endif
-template void Reduce( const float* sbuf, float* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const double* sbuf, double* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const Complex<float>* sbuf, Complex<float>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const Complex<double>* sbuf, Complex<double>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueInt<Int>* sbuf, ValueInt<Int>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueInt<float>* sbuf, ValueInt<float>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueInt<double>* sbuf, ValueInt<double>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueIntPair<Int>* sbuf, ValueIntPair<Int>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueIntPair<float>* sbuf, ValueIntPair<float>* rbuf, int count, Op op, int root, Comm comm );
-template void Reduce( const ValueIntPair<double>* sbuf, ValueIntPair<double>* rbuf, int count, Op op, int root, Comm comm );
+template void Reduce (const float *sbuf, float *rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const double *sbuf, double *rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const Complex < float >*sbuf,
+                      Complex < float >*rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const Complex < double >*sbuf,
+                      Complex < double >*rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const ValueInt < Int > *sbuf,
+                      ValueInt < Int > *rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const ValueInt < float >*sbuf,
+                      ValueInt < float >*rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const ValueInt < double >*sbuf,
+                      ValueInt < double >*rbuf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (const ValueIntPair < Int > *sbuf,
+                      ValueIntPair < Int > *rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const ValueIntPair < float >*sbuf,
+                      ValueIntPair < float >*rbuf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (const ValueIntPair < double >*sbuf,
+                      ValueIntPair < double >*rbuf,
+                      int count, Op op, int root,
+                      Comm comm);

-template<typename T>
-void Reduce( const T* sbuf, T* rbuf, int count, int root, Comm comm )
-{ Reduce( sbuf, rbuf, count, mpi::SUM, root, comm ); }
+template < typename T >
+void Reduce (const T * sbuf, T * rbuf, int count,
+             int root, Comm comm)
+{
+    Reduce (sbuf, rbuf, count, mpi::SUM, root, comm);
+}

-template void Reduce( const byte* sbuf, byte* rbuf, int count, int root, Comm comm );
-template void Reduce( const int* sbuf, int* rbuf, int count, int root, Comm comm );
-template void Reduce( const unsigned* sbuf, unsigned* rbuf, int count, int root, Comm comm );
-template void Reduce( const long int* sbuf, long int* rbuf, int count, int root, Comm comm );
-template void Reduce( const unsigned long* sbuf, unsigned long* rbuf, int count, int root, Comm comm );
+template void Reduce (const byte * sbuf, byte * rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const int *sbuf, int *rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const unsigned *sbuf,
+                      unsigned *rbuf, int count, int root,
+                      Comm comm);
+template void Reduce (const long int *sbuf,
+                      long int *rbuf, int count, int root,
+                      Comm comm);
+template void Reduce (const unsigned long *sbuf,
+                      unsigned long *rbuf, int count,
+                      int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Reduce( const long long int* sbuf, long long int* rbuf, int count, int root, Comm comm );
-template void Reduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, int root, Comm comm );
+template void Reduce (const long long int *sbuf,
+                      long long int *rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const unsigned long long *sbuf,
+                      unsigned long long *rbuf, int count,
+                      int root, Comm comm);
 #endif
-template void Reduce( const float* sbuf, float* rbuf, int count, int root, Comm comm );
-template void Reduce( const double* sbuf, double* rbuf, int count, int root, Comm comm );
-template void Reduce( const Complex<float>* sbuf, Complex<float>* rbuf, int count, int root, Comm comm );
-template void Reduce( const Complex<double>* sbuf, Complex<double>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueInt<Int>* sbuf, ValueInt<Int>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueInt<float>* sbuf, ValueInt<float>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueInt<double>* sbuf, ValueInt<double>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueIntPair<Int>* sbuf, ValueIntPair<Int>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueIntPair<float>* sbuf, ValueIntPair<float>* rbuf, int count, int root, Comm comm );
-template void Reduce( const ValueIntPair<double>* sbuf, ValueIntPair<double>* rbuf, int count, int root, Comm comm );
+template void Reduce (const float *sbuf, float *rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const double *sbuf, double *rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const Complex < float >*sbuf,
+                      Complex < float >*rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const Complex < double >*sbuf,
+                      Complex < double >*rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const ValueInt < Int > *sbuf,
+                      ValueInt < Int > *rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const ValueInt < float >*sbuf,
+                      ValueInt < float >*rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const ValueInt < double >*sbuf,
+                      ValueInt < double >*rbuf, int count,
+                      int root, Comm comm);
+template void Reduce (const ValueIntPair < Int > *sbuf,
+                      ValueIntPair < Int > *rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const ValueIntPair < float >*sbuf,
+                      ValueIntPair < float >*rbuf,
+                      int count, int root, Comm comm);
+template void Reduce (const ValueIntPair < double >*sbuf,
+                      ValueIntPair < double >*rbuf,
+                      int count, int root, Comm comm);

-template<typename T>
-T Reduce( T sb, Op op, int root, Comm comm )
-{
+template < typename T > T Reduce (T sb, Op op, int root,
+                                  Comm comm)
+{
     T rb;
-    Reduce( &sb, &rb, 1, op, root, comm );
+
+    Reduce (&sb, &rb, 1, op, root, comm);
     return rb;
 }

-template byte Reduce( byte sb, Op op, int root, Comm comm );
-template int Reduce( int sb, Op op, int root, Comm comm );
-template unsigned Reduce( unsigned sb, Op op, int root, Comm comm );
-template long int Reduce( long int sb, Op op, int root, Comm comm );
-template unsigned long Reduce( unsigned long sb, Op op, int root, Comm comm );
+template byte Reduce (byte sb, Op op, int root,
+                      Comm comm);
+template int Reduce (int sb, Op op, int root, Comm comm);
+template unsigned Reduce (unsigned sb, Op op, int root,
+                          Comm comm);
+template long int Reduce (long int sb, Op op, int root,
+                          Comm comm);
+template unsigned long Reduce (unsigned long sb, Op op,
+                               int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int Reduce( long long int sb, Op op, int root, Comm comm );
-template unsigned long long Reduce( unsigned long long sb, Op op, int root, Comm comm );
+template long long int Reduce (long long int sb, Op op,
+                               int root, Comm comm);
+template unsigned long long Reduce (unsigned long long sb,
+                                    Op op, int root,
+                                    Comm comm);
 #endif
-template float Reduce( float sb, Op op, int root, Comm comm );
-template double Reduce( double sb, Op op, int root, Comm comm );
-template Complex<float> Reduce( Complex<float> sb, Op op, int root, Comm comm );
-template Complex<double> Reduce( Complex<double> sb, Op op, int root, Comm comm );
-template ValueInt<Int> Reduce( ValueInt<Int> sb, Op op, int root, Comm comm );
-template ValueInt<float> Reduce( ValueInt<float> sb, Op op, int root, Comm comm );
-template ValueInt<double> Reduce( ValueInt<double> sb, Op op, int root, Comm comm );
-template ValueIntPair<Int> Reduce( ValueIntPair<Int> sb, Op op, int root, Comm comm );
-template ValueIntPair<float> Reduce( ValueIntPair<float> sb, Op op, int root, Comm comm );
-template ValueIntPair<double> Reduce( ValueIntPair<double> sb, Op op, int root, Comm comm );
+template float Reduce (float sb, Op op, int root,
+                       Comm comm);
+template double Reduce (double sb, Op op, int root,
+                        Comm comm);
+template Complex < float >Reduce (Complex < float >sb,
+                                  Op op, int root,
+                                  Comm comm);
+template Complex < double >Reduce (Complex < double >sb,
+                                   Op op, int root,
+                                   Comm comm);
+template ValueInt < Int > Reduce (ValueInt < Int > sb,
+                                  Op op, int root,
+                                  Comm comm);
+template ValueInt < float >Reduce (ValueInt < float >sb,
+                                   Op op, int root,
+                                   Comm comm);
+template ValueInt < double >Reduce (ValueInt < double >sb,
+                                    Op op, int root,
+                                    Comm comm);
+template ValueIntPair < Int > Reduce (ValueIntPair < Int >
+                                      sb, Op op, int root,
+                                      Comm comm);
+template ValueIntPair < float >Reduce (ValueIntPair <
+                                       float >sb, Op op,
+                                       int root,
+                                       Comm comm);
+template ValueIntPair < double >Reduce (ValueIntPair <
+                                        double >sb, Op op,
+                                        int root,
+                                        Comm comm);

-template<typename T>
-T Reduce( T sb, int root, Comm comm )
-{
+template < typename T > T Reduce (T sb, int root,
+                                  Comm comm)
+{
     T rb;
-    Reduce( &sb, &rb, 1, mpi::SUM, root, comm );
+
+    Reduce (&sb, &rb, 1, mpi::SUM, root, comm);
     return rb;
 }

-template byte Reduce( byte sb, int root, Comm comm );
-template int Reduce( int sb, int root, Comm comm );
-template unsigned Reduce( unsigned sb, int root, Comm comm );
-template long int Reduce( long int sb, int root, Comm comm );
-template unsigned long Reduce( unsigned long sb, int root, Comm comm );
+template byte Reduce (byte sb, int root, Comm comm);
+template int Reduce (int sb, int root, Comm comm);
+template unsigned Reduce (unsigned sb, int root,
+                          Comm comm);
+template long int Reduce (long int sb, int root,
+                          Comm comm);
+template unsigned long Reduce (unsigned long sb, int root,
+                               Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template long long int Reduce( long long int sb, int root, Comm comm );
-template unsigned long long Reduce( unsigned long long sb, int root, Comm comm );
+template long long int Reduce (long long int sb, int root,
+                               Comm comm);
+template unsigned long long Reduce (unsigned long long sb,
+                                    int root, Comm comm);
 #endif
-template float Reduce( float sb, int root, Comm comm );
-template double Reduce( double sb, int root, Comm comm );
-template Complex<float> Reduce( Complex<float> sb, int root, Comm comm );
-template Complex<double> Reduce( Complex<double> sb, int root, Comm comm );
-template ValueInt<Int> Reduce( ValueInt<Int> sb, int root, Comm comm );
-template ValueInt<float> Reduce( ValueInt<float> sb, int root, Comm comm );
-template ValueInt<double> Reduce( ValueInt<double> sb, int root, Comm comm );
-template ValueIntPair<Int> Reduce( ValueIntPair<Int> sb, int root, Comm comm );
-template ValueIntPair<float> Reduce( ValueIntPair<float> sb, int root, Comm comm );
-template ValueIntPair<double> Reduce( ValueIntPair<double> sb, int root, Comm comm );
+template float Reduce (float sb, int root, Comm comm);
+template double Reduce (double sb, int root, Comm comm);
+template Complex < float >Reduce (Complex < float >sb,
+                                  int root, Comm comm);
+template Complex < double >Reduce (Complex < double >sb,
+                                   int root, Comm comm);
+template ValueInt < Int > Reduce (ValueInt < Int > sb,
+                                  int root, Comm comm);
+template ValueInt < float >Reduce (ValueInt < float >sb,
+                                   int root, Comm comm);
+template ValueInt < double >Reduce (ValueInt < double >sb,
+                                    int root, Comm comm);
+template ValueIntPair < Int > Reduce (ValueIntPair < Int >
+                                      sb, int root,
+                                      Comm comm);
+template ValueIntPair < float >Reduce (ValueIntPair <
+                                       float >sb,
+                                       int root,
+                                       Comm comm);
+template ValueIntPair < double >Reduce (ValueIntPair <
+                                        double >sb,
+                                        int root,
+                                        Comm comm);

-template<typename T>
-void Reduce( T* buf, int count, Op op, int root, Comm comm )
+template < typename T >
+void Reduce (T * buf, int count, Op op, int root,
+             Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Reduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce"))
+    if (count != 0)
     {
-        const int commRank = Rank( comm );
-        if( commRank == root )
+        const int commRank = Rank (comm);
+
+        if (commRank == root)
         {
 #ifdef EL_HAVE_MPI_IN_PLACE
             SafeMpi
-            ( MPI_Reduce
-              ( MPI_IN_PLACE, buf, count, TypeMap<T>(), op.op, root,
-                comm.comm ) );
+                (MPI_Reduce
+                 (MPI_IN_PLACE, buf, count,
+                  TypeMap < T > (), op.op, root,
+                  comm.comm));
 #else
-            std::vector<T> sendBuf( count );
-            MemCopy( sendBuf.data(), buf, count );
+            std::vector < T > sendBuf (count);
+            MemCopy (sendBuf.data (), buf, count);
             SafeMpi
-            ( MPI_Reduce
-              ( sendBuf.data(), buf, count, TypeMap<T>(), op.op, root,
-                comm.comm ) );
+                (MPI_Reduce
+                 (sendBuf.data (), buf, count,
+                  TypeMap < T > (), op.op, root,
+                  comm.comm));
 #endif
         }
         else
             SafeMpi
-            ( MPI_Reduce
-              ( buf, 0, count, TypeMap<T>(), op.op, root, comm.comm ) );
+                (MPI_Reduce
+                 (buf, 0, count, TypeMap < T > (),
+                  op.op, root, comm.comm));
     }
 }

-template<typename R>
-void Reduce( Complex<R>* buf, int count, Op op, int root, Comm comm )
+template < typename R >
+void Reduce (Complex < R > *buf, int count, Op op,
+             int root, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::Reduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce"))
+    if (count != 0)
     {
-        const int commRank = Rank( comm );
+        const int commRank = Rank (comm);
+
 #ifdef EL_AVOID_COMPLEX_MPI
-        if( op == SUM )
+        if (op == SUM)
         {
-            if( commRank == root )
+            if (commRank == root)
             {
-# ifdef EL_HAVE_MPI_IN_PLACE
-                SafeMpi
-                ( MPI_Reduce
-                  ( MPI_IN_PLACE, buf, 2*count, TypeMap<R>(), op.op,
-                    root, comm.comm ) );
-# else
-                std::vector<Complex<R>> sendBuf( count );
-                MemCopy( sendBuf.data(), buf, count );
+#ifdef EL_HAVE_MPI_IN_PLACE
                 SafeMpi
-                ( MPI_Reduce
-                  ( sendBuf.data(), buf, 2*count, TypeMap<R>(), op.op,
-                    root, comm.comm ) );
-# endif
+                    (MPI_Reduce
+                     (MPI_IN_PLACE, buf, 2 * count,
+                      TypeMap < R > (), op.op, root,
+                      comm.comm));
+#else
+                std::vector < Complex <
+                      R >> sendBuf (count);
+                MemCopy (sendBuf.data (), buf,
+                         count);
+                SafeMpi (MPI_Reduce
+                         (sendBuf.data (), buf,
+                          2 * count,
+                          TypeMap < R > (), op.op,
+                          root, comm.comm));
+#endif
             }
             else
                 SafeMpi
-                ( MPI_Reduce
-                  ( buf, 0, 2*count, TypeMap<R>(), op.op, root, comm.comm ) );
+                    (MPI_Reduce
+                     (buf, 0, 2 * count,
+                      TypeMap < R > (), op.op, root,
+                      comm.comm));
         }
         else
         {
-            if( commRank == root )
+            if (commRank == root)
             {
-# ifdef EL_HAVE_MPI_IN_PLACE
-                SafeMpi
-                ( MPI_Reduce
-                  ( MPI_IN_PLACE, buf, count, TypeMap<Complex<R>>(), op.op,
-                    root, comm.comm ) );
-# else
-                std::vector<Complex<R>> sendBuf( count );
-                MemCopy( sendBuf.data(), buf, count );
+#ifdef EL_HAVE_MPI_IN_PLACE
                 SafeMpi
-                ( MPI_Reduce
-                  ( sendBuf.data(), buf, count, TypeMap<Complex<R>>(), op.op,
-                    root, comm.comm ) );
-# endif
+                    (MPI_Reduce
+                     (MPI_IN_PLACE, buf, count,
+                      TypeMap < Complex < R >> (),
+                      op.op, root, comm.comm));
+#else
+                std::vector < Complex <
+                      R >> sendBuf (count);
+                MemCopy (sendBuf.data (), buf,
+                         count);
+                SafeMpi (MPI_Reduce
+                         (sendBuf.data (), buf,
+                          count,
+                          TypeMap < Complex <
+                          R >> (), op.op, root,
+                          comm.comm));
+#endif
             }
             else
                 SafeMpi
-                ( MPI_Reduce
-                  ( buf, 0, count, TypeMap<Complex<R>>(), op.op,
-                    root, comm.comm ) );
+                    (MPI_Reduce
+                     (buf, 0, count,
+                      TypeMap < Complex < R >> (),
+                      op.op, root, comm.comm));
         }
 #else
-        if( commRank == root )
+        if (commRank == root)
         {
-# ifdef EL_HAVE_MPI_IN_PLACE
-            SafeMpi
-            ( MPI_Reduce
-              ( MPI_IN_PLACE, buf, count, TypeMap<Complex<R>>(), op.op,
-                root, comm.comm ) );
-# else
-            std::vector<Complex<R>> sendBuf( count );
-            MemCopy( sendBuf.data(), buf, count );
+#ifdef EL_HAVE_MPI_IN_PLACE
             SafeMpi
-            ( MPI_Reduce
-              ( sendBuf.data(), buf, count, TypeMap<Complex<R>>(), op.op,
-                root, comm.comm ) );
-# endif
+                (MPI_Reduce
+                 (MPI_IN_PLACE, buf, count,
+                  TypeMap < Complex < R >> (), op.op,
+                  root, comm.comm));
+#else
+            std::vector < Complex <
+                  R >> sendBuf (count);
+            MemCopy (sendBuf.data (), buf, count);
+            SafeMpi
+                (MPI_Reduce
+                 (sendBuf.data (), buf, count,
+                  TypeMap < Complex < R >> (), op.op,
+                  root, comm.comm));
+#endif
         }
         else
             SafeMpi
-            ( MPI_Reduce
-              ( buf, 0, count, TypeMap<Complex<R>>(), op.op, root,
-                comm.comm ) );
+                (MPI_Reduce
+                 (buf, 0, count,
+                  TypeMap < Complex < R >> (), op.op,
+                  root, comm.comm));
 #endif
     }
 }
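+// The in-place complex reduction combines both devices above: at the
+// root, MPI_IN_PLACE (or an explicit copy into a temporary sendBuf)
+// supplies the send data, while the EL_AVOID_COMPLEX_MPI SUM path
+// additionally doubles the count so the reduction runs over the reals.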
-template void Reduce( byte* buf, int count, Op op, int root, Comm comm );
-template void Reduce( int* buf, int count, Op op, int root, Comm comm );
-template void Reduce( unsigned* buf, int count, Op op, int root, Comm comm );
-template void Reduce( long int* buf, int count, Op op, int root, Comm comm );
-template void Reduce( unsigned long* buf, int count, Op op, int root, Comm comm );
+template void Reduce (byte * buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (int *buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (unsigned *buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (long int *buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (unsigned long *buf, int count,
+                      Op op, int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Reduce( long long int* buf, int count, Op op, int root, Comm comm );
-template void Reduce( unsigned long long* buf, int count, Op op, int root, Comm comm );
+template void Reduce (long long int *buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (unsigned long long *buf, int count,
+                      Op op, int root, Comm comm);
 #endif
-template void Reduce( float* buf, int count, Op op, int root, Comm comm );
-template void Reduce( double* buf, int count, Op op, int root, Comm comm );
-template void Reduce( Complex<float>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( Complex<double>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueInt<Int>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueInt<float>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueInt<double>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueIntPair<Int>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueIntPair<float>* buf, int count, Op op, int root, Comm comm );
-template void Reduce( ValueIntPair<double>* buf, int count, Op op, int root, Comm comm );
+template void Reduce (float *buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (double *buf, int count, Op op,
+                      int root, Comm comm);
+template void Reduce (Complex < float >*buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (Complex < double >*buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (ValueInt < Int > *buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (ValueInt < float >*buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (ValueInt < double >*buf, int count,
+                      Op op, int root, Comm comm);
+template void Reduce (ValueIntPair < Int > *buf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (ValueIntPair < float >*buf,
+                      int count, Op op, int root,
+                      Comm comm);
+template void Reduce (ValueIntPair < double >*buf,
+                      int count, Op op, int root,
+                      Comm comm);

-template<typename T>
-void Reduce( T* buf, int count, int root, Comm comm )
-{ Reduce( buf, count, mpi::SUM, root, comm ); }
+template < typename T >
+void Reduce (T * buf, int count, int root, Comm comm)
+{
+    Reduce (buf, count, mpi::SUM, root, comm);
+}

-template void Reduce( byte* buf, int count, int root, Comm comm );
-template void Reduce( int* buf, int count, int root, Comm comm );
-template void Reduce( unsigned* buf, int count, int root, Comm comm );
-template void Reduce( long int* buf, int count, int root, Comm comm );
-template void Reduce( unsigned long* buf, int count, int root, Comm comm );
+template void Reduce (byte * buf, int count, int root,
+                      Comm comm);
+template void Reduce (int *buf, int count, int root,
+                      Comm comm);
+template void Reduce (unsigned *buf, int count, int root,
+                      Comm comm);
+template void Reduce (long int *buf, int count, int root,
+                      Comm comm);
+template void Reduce (unsigned long *buf, int count,
+                      int root, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void Reduce( long long int* buf, int count, int root, Comm comm );
-template void Reduce( unsigned long long* buf, int count, int root, Comm comm );
+template void Reduce (long long int *buf, int count,
+                      int root, Comm comm);
+template void Reduce (unsigned long long *buf, int count,
+                      int root, Comm comm);
 #endif
-template void Reduce( float* buf, int count, int root, Comm comm );
-template void Reduce( double* buf, int count, int root, Comm comm );
-template void Reduce( Complex<float>* buf, int count, int root, Comm comm );
-template void Reduce( Complex<double>* buf, int count, int root, Comm comm );
-template void Reduce( ValueInt<Int>* buf, int count, int root, Comm comm );
-template void Reduce( ValueInt<float>* buf, int count, int root, Comm comm );
-template void Reduce( ValueInt<double>* buf, int count, int root, Comm comm );
-template void Reduce( ValueIntPair<Int>* buf, int count, int root, Comm comm );
-template void Reduce( ValueIntPair<float>* buf, int count, int root, Comm comm );
-template void Reduce( ValueIntPair<double>* buf, int count, int root, Comm comm );
+template void Reduce (float *buf, int count, int root,
+                      Comm comm);
+template void Reduce (double *buf, int count, int root,
+                      Comm comm);
+template void Reduce (Complex < float >*buf, int count,
+                      int root, Comm comm);
+template void Reduce (Complex < double >*buf, int count,
+                      int root, Comm comm);
+template void Reduce (ValueInt < Int > *buf, int count,
+                      int root, Comm comm);
+template void Reduce (ValueInt < float >*buf, int count,
+                      int root, Comm comm);
+template void Reduce (ValueInt < double >*buf, int count,
+                      int root, Comm comm);
+template void Reduce (ValueIntPair < Int > *buf,
+                      int count, int root, Comm comm);
+template void Reduce (ValueIntPair < float >*buf,
+                      int count, int root, Comm comm);
+template void Reduce (ValueIntPair < double >*buf,
+                      int count, int root, Comm comm);

-template<typename T>
-void AllReduce( const T* sbuf, T* rbuf, int count, Op op, Comm comm )
+template < typename T >
+void AllReduce (const T * sbuf, T * rbuf, int count,
+                Op op, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce"))
+    if (count != 0)
     {
-        SafeMpi
-        ( MPI_Allreduce
-          ( const_cast<T*>(sbuf), rbuf, count, TypeMap<T>(), op.op,
-            comm.comm ) );
+        SafeMpi (MPI_Allreduce
+                 (const_cast < T * >(sbuf), rbuf, count,
+                  TypeMap < T > (), op.op, comm.comm));
     }
 }

-template<typename R>
+template < typename R >
 void AllReduce
-( const Complex<R>* sbuf, Complex<R>* rbuf, int count, Op op, Comm comm )
+(const Complex < R > *sbuf, Complex < R > *rbuf,
+ int count, Op op, Comm comm)
 {
-    DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce"))
-    if( count != 0 )
+    DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce"))
+    if (count != 0)
     {
 #ifdef EL_AVOID_COMPLEX_MPI
-        if( op == SUM )
+        if (op == SUM)
         {
             SafeMpi
-            ( MPI_Allreduce
-              ( const_cast<Complex<R>*>(sbuf),
-                rbuf, 2*count, TypeMap<R>(), op.op, comm.comm ) );
+                (MPI_Allreduce
+                 (const_cast < Complex < R > *>(sbuf),
+                  rbuf, 2 * count, TypeMap < R > (),
+                  op.op, comm.comm));
         }
         else
         {
             SafeMpi
-            ( MPI_Allreduce
-              ( const_cast<Complex<R>*>(sbuf),
-                rbuf, count, TypeMap<Complex<R>>(), op.op, comm.comm ) );
+                (MPI_Allreduce
+                 (const_cast < Complex < R > *>(sbuf),
+                  rbuf, count,
+                  TypeMap < Complex < R >> (), op.op,
+                  comm.comm));
         }
 #else
         SafeMpi
-        ( MPI_Allreduce
-          ( const_cast<Complex<R>*>(sbuf),
-            rbuf, count, TypeMap<Complex<R>>(), op.op, comm.comm ) );
+            (MPI_Allreduce
+             (const_cast < Complex < R > *>(sbuf),
+              rbuf, count, TypeMap < Complex < R >> (),
+              op.op, comm.comm));
 #endif
     }
 }

-template void AllReduce( const byte* sbuf, byte* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const int* sbuf, int* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const unsigned* sbuf, unsigned* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const long int* sbuf, long int* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Op op, Comm comm );
+template void AllReduce (const byte * sbuf, byte * rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const int *sbuf, int *rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const unsigned *sbuf,
+                         unsigned *rbuf, int count, Op op,
+                         Comm comm);
+template void AllReduce (const long int *sbuf,
+                         long int *rbuf, int count, Op op,
+                         Comm comm);
+template void AllReduce (const unsigned long *sbuf,
+                         unsigned long *rbuf, int count,
+                         Op op, Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
-template void AllReduce( const long long int* sbuf, long long int* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, Op op, Comm comm );
+template void AllReduce (const long long int *sbuf,
+                         long long int *rbuf, int count,
+                         Op op, Comm comm);
+template void AllReduce (const unsigned long long *sbuf,
+                         unsigned long long *rbuf,
+                         int count, Op op, Comm comm);
 #endif
-template void AllReduce( const float* sbuf, float* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const double* sbuf, double* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const Complex<float>* sbuf, Complex<float>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const Complex<double>* sbuf, Complex<double>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueInt<Int>* sbuf, ValueInt<Int>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueInt<float>* sbuf, ValueInt<float>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueInt<double>* sbuf, ValueInt<double>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueIntPair<Int>* sbuf, ValueIntPair<Int>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueIntPair<float>* sbuf, ValueIntPair<float>* rbuf, int count, Op op, Comm comm );
-template void AllReduce( const ValueIntPair<double>* sbuf, ValueIntPair<double>* rbuf, int count, Op op, Comm comm );
+template void AllReduce (const float *sbuf, float *rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const double *sbuf, double *rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const Complex < float >*sbuf,
+                         Complex < float >*rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const Complex < double >*sbuf,
+                         Complex < double >*rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueInt < Int > *sbuf,
+                         ValueInt < Int > *rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueInt < float >*sbuf,
+                         ValueInt < float >*rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueInt < double >*sbuf,
+                         ValueInt < double >*rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueIntPair < Int > *sbuf,
+                         ValueIntPair < Int > *rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueIntPair <
+                         float >*sbuf,
+                         ValueIntPair < float >*rbuf,
+                         int count, Op op, Comm comm);
+template void AllReduce (const ValueIntPair <
+                         double >*sbuf,
+                         ValueIntPair < double >*rbuf,
+                         int count, Op op, Comm comm);

-template<typename T>
-void AllReduce( const T* sbuf, T* rbuf, int count, Comm comm )
-{ AllReduce( sbuf, rbuf, count, mpi::SUM, comm ); }
+template < typename T >
+void AllReduce (const T * sbuf, T * rbuf, int count,
+                Comm comm)
+{
+    AllReduce (sbuf, rbuf, count, mpi::SUM, comm);
+}

-template void AllReduce( const byte* sbuf, byte* rbuf, int count, Comm comm );
-template void AllReduce( const int* sbuf, int* rbuf, int count, Comm comm );
-template void AllReduce( const unsigned* sbuf, unsigned* rbuf, int count, Comm comm );
-template void AllReduce( const long int* sbuf, long int* rbuf, int count, Comm comm );
-template void AllReduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Comm comm );
+template void AllReduce (const byte * sbuf, byte * rbuf,
+                         int count, Comm comm);
+template void AllReduce (const int *sbuf, int *rbuf,
+                         int count, Comm comm);
+template void AllReduce (const unsigned *sbuf,
+                         unsigned *rbuf, int count,
+                         Comm comm);
+template void AllReduce (const long int *sbuf,
+                         long int *rbuf, int count,
+                         Comm comm);
+template void AllReduce (const unsigned long *sbuf,
+                         unsigned long *rbuf, int count,
+                         Comm comm);
 #ifdef EL_HAVE_MPI_LONG_LONG
+template void AllReduce (const long long int *sbuf,
+                         long long int *rbuf, int count,
+
Comm comm); +template void AllReduce (const unsigned long long *sbuf, + unsigned long long *rbuf, + int count, Comm comm); +#endif +template void AllReduce (const float *sbuf, float *rbuf, + int count, Comm comm); +template void AllReduce (const double *sbuf, double *rbuf, + int count, Comm comm); +template void AllReduce (const Complex < float >*sbuf, + Complex < float >*rbuf, + int count, Comm comm); +template void AllReduce (const Complex < double >*sbuf, + Complex < double >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < Int > *sbuf, + ValueInt < Int > *rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < float >*sbuf, + ValueInt < float >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < double >*sbuf, + ValueInt < double >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < Int > *sbuf, + ValueIntPair < Int > *rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < + float >*sbuf, + ValueIntPair < float >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < + double >*sbuf, + ValueIntPair < double >*rbuf, + int count, Comm comm); + +template < typename T > T AllReduce (T sb, Op op, + Comm comm) +{ + T rb; + + AllReduce (&sb, &rb, 1, op, comm); + return rb; +} + +template byte AllReduce (byte sb, Op op, Comm comm); +template int AllReduce (int sb, Op op, Comm comm); +template unsigned AllReduce (unsigned sb, Op op, + Comm comm); +template long int AllReduce (long int sb, Op op, + Comm comm); +template unsigned long AllReduce (unsigned long sb, Op op, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int AllReduce( long long int sb, Op op, Comm comm ); -template unsigned long long AllReduce( unsigned long long sb, Op op, Comm comm ); -#endif -template float AllReduce( float sb, Op op, Comm comm ); -template double AllReduce( double sb, Op op, Comm comm ); -template Complex AllReduce( Complex sb, Op op, Comm comm ); -template Complex AllReduce( Complex sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); - -template -T AllReduce( T sb, Comm comm ) -{ return AllReduce( sb, mpi::SUM, comm ); } - -template byte AllReduce( byte sb, Comm comm ); -template int AllReduce( int sb, Comm comm ); -template unsigned AllReduce( unsigned sb, Comm comm ); -template long int AllReduce( long int sb, Comm comm ); -template unsigned long AllReduce( unsigned long sb, Comm comm ); +template long long int AllReduce (long long int sb, Op op, + Comm comm); +template unsigned long long AllReduce (unsigned long long + sb, Op op, + Comm comm); +#endif +template float AllReduce (float sb, Op op, Comm comm); +template double AllReduce (double sb, Op op, Comm comm); +template Complex < float >AllReduce (Complex < float >sb, + Op op, Comm comm); +template Complex < double >AllReduce (Complex < + double >sb, Op op, + Comm comm); +template ValueInt < Int > AllReduce (ValueInt < Int > sb, + Op op, Comm comm); +template ValueInt < float >AllReduce (ValueInt < + float >sb, Op op, + Comm comm); +template ValueInt < double >AllReduce (ValueInt < + double >sb, Op op, + Comm comm); +template ValueIntPair 
< Int > AllReduce (ValueIntPair < + Int > sb, Op op, + Comm comm); +template ValueIntPair < float >AllReduce (ValueIntPair < + float >sb, + Op op, + Comm comm); +template ValueIntPair < double >AllReduce (ValueIntPair < + double >sb, + Op op, + Comm comm); + +template < typename T > T AllReduce (T sb, Comm comm) +{ + return AllReduce (sb, mpi::SUM, comm); +} + +template byte AllReduce (byte sb, Comm comm); +template int AllReduce (int sb, Comm comm); +template unsigned AllReduce (unsigned sb, Comm comm); +template long int AllReduce (long int sb, Comm comm); +template unsigned long AllReduce (unsigned long sb, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int AllReduce( long long int sb, Comm comm ); -template unsigned long long AllReduce( unsigned long long sb, Comm comm ); -#endif -template float AllReduce( float sb, Comm comm ); -template double AllReduce( double sb, Comm comm ); -template Complex AllReduce( Complex sb, Comm comm ); -template Complex AllReduce( Complex sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm comm ); - -template -void AllReduce( T* buf, int count, Op op, Comm comm ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) +template long long int AllReduce (long long int sb, + Comm comm); +template unsigned long long AllReduce (unsigned long long + sb, Comm comm); +#endif +template float AllReduce (float sb, Comm comm); +template double AllReduce (double sb, Comm comm); +template Complex < float >AllReduce (Complex < float >sb, + Comm comm); +template Complex < double >AllReduce (Complex < + double >sb, + Comm comm); +template ValueInt < Int > AllReduce (ValueInt < Int > sb, + Comm comm); +template ValueInt < float >AllReduce (ValueInt < + float >sb, + Comm comm); +template ValueInt < double >AllReduce (ValueInt < + double >sb, + Comm comm); +template ValueIntPair < Int > AllReduce (ValueIntPair < + Int > sb, + Comm comm); +template ValueIntPair < float >AllReduce (ValueIntPair < + float >sb, + Comm comm); +template ValueIntPair < double >AllReduce (ValueIntPair < + double >sb, + Comm comm); + +template < typename T > +void AllReduce (T * buf, int count, Op op, Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { #ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Allreduce - ( MPI_IN_PLACE, buf, count, TypeMap(), op.op, comm.comm ) ); + (MPI_Allreduce + (MPI_IN_PLACE, buf, count, + TypeMap < T > (), op.op, comm.comm)); #else - std::vector sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); + std::vector < T > sendBuf (count); + MemCopy (sendBuf.data (), buf, count); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap(), op.op, comm.comm ) ); + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < T > (), op.op, comm.comm)); #endif } } -template -void AllReduce( Complex* buf, int count, Op op, Comm comm ) +template < typename R > +void AllReduce (Complex < R > *buf, int count, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE 
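// Note on this branch: MPI_IN_PLACE lets 'buf' serve as both send and
// receive buffer, avoiding the explicit staging copy made in the #else
// branch below. Treating the complex data as 2*count reals is valid only
// because op == SUM on this path; summing real and imaginary parts
// independently is exactly complex summation.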
SafeMpi - ( MPI_Allreduce - ( MPI_IN_PLACE, buf, 2*count, TypeMap(), op.op, comm.comm ) ); -# else - std::vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); + (MPI_Allreduce + (MPI_IN_PLACE, buf, 2 * count, + TypeMap < R > (), op.op, + comm.comm)); +#else + std::vector < Complex < + R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, count); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, 2*count, TypeMap(), op.op, - comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, 2 * count, + TypeMap < R > (), op.op, + comm.comm)); +#endif } else { -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Allreduce - ( MPI_IN_PLACE, buf, count, TypeMap>(), - op.op, comm.comm ) ); -# else - std::vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); + (MPI_Allreduce + (MPI_IN_PLACE, buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#else + std::vector < Complex < + R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, count); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap>(), - op.op, comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#endif } #else -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Allreduce - ( MPI_IN_PLACE, buf, count, TypeMap>(), op.op, - comm.comm ) ); -# else - std::vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); + (MPI_Allreduce + (MPI_IN_PLACE, buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#else + std::vector < Complex < R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, count); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap>(), op.op, - comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#endif #endif } } -template void AllReduce( byte* buf, int count, Op op, Comm comm ); -template void AllReduce( int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned* buf, int count, Op op, Comm comm ); -template void AllReduce( long int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned long* buf, int count, Op op, Comm comm ); +template void AllReduce (byte * buf, int count, Op op, + Comm comm); +template void AllReduce (int *buf, int count, Op op, + Comm comm); +template void AllReduce (unsigned *buf, int count, Op op, + Comm comm); +template void AllReduce (long int *buf, int count, Op op, + Comm comm); +template void AllReduce (unsigned long *buf, int count, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( long long int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned long long* buf, int count, Op op, Comm comm ); -#endif -template void AllReduce( float* buf, int count, Op op, Comm comm ); -template void AllReduce( double* buf, int count, Op op, Comm comm ); -template void AllReduce( Complex* buf, int count, Op op, Comm comm ); -template void AllReduce( Complex* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); - -template -void AllReduce( T* 
buf, int count, Comm comm ) -{ AllReduce( buf, count, mpi::SUM, comm ); } - -template void AllReduce( byte* buf, int count, Comm comm ); -template void AllReduce( int* buf, int count, Comm comm ); -template void AllReduce( unsigned* buf, int count, Comm comm ); -template void AllReduce( long int* buf, int count, Comm comm ); -template void AllReduce( unsigned long* buf, int count, Comm comm ); +template void AllReduce (long long int *buf, int count, + Op op, Comm comm); +template void AllReduce (unsigned long long *buf, + int count, Op op, Comm comm); +#endif +template void AllReduce (float *buf, int count, Op op, + Comm comm); +template void AllReduce (double *buf, int count, Op op, + Comm comm); +template void AllReduce (Complex < float >*buf, int count, + Op op, Comm comm); +template void AllReduce (Complex < double >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueInt < Int > *buf, int count, + Op op, Comm comm); +template void AllReduce (ValueInt < float >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueInt < double >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < Int > *buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < float >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < double >*buf, + int count, Op op, Comm comm); + +template < typename T > +void AllReduce (T * buf, int count, Comm comm) +{ + AllReduce (buf, count, mpi::SUM, comm); +} + +template void AllReduce (byte * buf, int count, + Comm comm); +template void AllReduce (int *buf, int count, Comm comm); +template void AllReduce (unsigned *buf, int count, + Comm comm); +template void AllReduce (long int *buf, int count, + Comm comm); +template void AllReduce (unsigned long *buf, int count, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( long long int* buf, int count, Comm comm ); -template void AllReduce( unsigned long long* buf, int count, Comm comm ); -#endif -template void AllReduce( float* buf, int count, Comm comm ); -template void AllReduce( double* buf, int count, Comm comm ); -template void AllReduce( Complex* buf, int count, Comm comm ); -template void AllReduce( Complex* buf, int count, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); - -template -void ReduceScatter( R* sbuf, R* rbuf, int rc, Op op, Comm comm ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) +template void AllReduce (long long int *buf, int count, + Comm comm); +template void AllReduce (unsigned long long *buf, + int count, Comm comm); +#endif +template void AllReduce (float *buf, int count, + Comm comm); +template void AllReduce (double *buf, int count, + Comm comm); +template void AllReduce (Complex < float >*buf, int count, + Comm comm); +template void AllReduce (Complex < double >*buf, + int count, Comm comm); +template void AllReduce (ValueInt < Int > *buf, int count, + Comm comm); +template void AllReduce (ValueInt < float >*buf, + int count, Comm comm); +template void AllReduce (ValueInt < double >*buf, + int count, Comm comm); +template void AllReduce (ValueIntPair < Int > *buf, + int count, Comm comm); +template void 
AllReduce (ValueIntPair < float >*buf, + int count, Comm comm); +template void AllReduce (ValueIntPair < double >*buf, + int count, Comm comm); + +template < typename R > +void ReduceScatter (R * sbuf, R * rbuf, int rc, Op op, + Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( sbuf, rc*commSize, op, comm ); - MemCopy( rbuf, &sbuf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (sbuf, rc * commSize, op, comm); + MemCopy (rbuf, &sbuf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, rc, TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter_block + (sbuf, rbuf, rc, TypeMap < R > (), op.op, + comm.comm)); #else - const int commSize = Size( comm ); - Reduce( sbuf, rc*commSize, op, 0, comm ); - Scatter( sbuf, rc, rbuf, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (sbuf, rc * commSize, op, 0, comm); + Scatter (sbuf, rc, rbuf, rc, 0, comm); #endif } -template +template < typename R > void ReduceScatter -( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ) +(Complex < R > *sbuf, Complex < R > *rbuf, int rc, + Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( sbuf, rc*commSize, op, comm ); - MemCopy( rbuf, &sbuf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (sbuf, rc * commSize, op, comm); + MemCopy (rbuf, &sbuf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# else + (MPI_Reduce_scatter_block + (sbuf, rbuf, 2 * rc, TypeMap < R > (), op.op, + comm.comm)); +#else SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, rc, TypeMap>(), op.op, comm.comm ) ); -# endif + (MPI_Reduce_scatter_block + (sbuf, rbuf, rc, TypeMap < Complex < R >> (), + op.op, comm.comm)); +#endif #else - const int commSize = Size( comm ); - Reduce( sbuf, rc*commSize, op, 0, comm ); - Scatter( sbuf, rc, rbuf, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (sbuf, rc * commSize, op, 0, comm); + Scatter (sbuf, rc, rbuf, rc, 0, comm); #endif } -template void ReduceScatter( byte* sbuf, byte* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( int* sbuf, int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned* sbuf, unsigned* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( long int* sbuf, long int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long* sbuf, unsigned long* rbuf, int rc, Op op, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* sbuf, long long int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long long* sbuf, unsigned long long* rbuf, int rc, Op op, Comm comm ); -#endif -template void ReduceScatter( float* sbuf, float* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( double* sbuf, double* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ); 
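// A sketch of the intended calling pattern for the block-regular
// ReduceScatter wrappers above (hypothetical driver code assuming 'comm'
// holds P ranks and <vector> is available; not part of this patch):
//
//   const int P = mpi::Size( comm );
//   std::vector<double> sendBuf( rc*P, 1. ), recvBuf( rc );
//   // each rank contributes P blocks of length rc; rank k receives the
//   // element-wise sum of every rank's k'th block
//   mpi::ReduceScatter( sendBuf.data(), recvBuf.data(), rc, mpi::SUM, comm );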
-template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ); - -template -void ReduceScatter( T* sbuf, T* rbuf, int rc, Comm comm ) -{ ReduceScatter( sbuf, rbuf, rc, mpi::SUM, comm ); } - -template void ReduceScatter( byte* sbuf, byte* rbuf, int rc, Comm comm ); -template void ReduceScatter( int* sbuf, int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned* sbuf, unsigned* rbuf, int rc, Comm comm ); -template void ReduceScatter( long int* sbuf, long int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned long* sbuf, unsigned long* rbuf, int rc, Comm comm ); +template void ReduceScatter (byte * sbuf, byte * rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (int *sbuf, int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned *sbuf, + unsigned *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (long int *sbuf, + long int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned long *sbuf, + unsigned long *rbuf, int rc, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* sbuf, long long int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned long long* sbuf, unsigned long long* rbuf, int rc, Comm comm ); -#endif -template void ReduceScatter( float* sbuf, float* rbuf, int rc, Comm comm ); -template void ReduceScatter( double* sbuf, double* rbuf, int rc, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Comm comm ); - -template -T ReduceScatter( T sb, Op op, Comm comm ) -{ T rb; ReduceScatter( &sb, &rb, 1, op, comm ); return rb; } - -template byte ReduceScatter( byte sb, Op op, Comm comm ); -template int ReduceScatter( int sb, Op op, Comm comm ); -template unsigned ReduceScatter( unsigned sb, Op op, Comm comm ); -template long int ReduceScatter( long int sb, Op op, Comm comm ); -template unsigned long ReduceScatter( unsigned long sb, Op op, Comm comm ); +template void ReduceScatter (long long int *sbuf, + long long int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned long long *sbuf, + unsigned long long *rbuf, + int rc, Op op, Comm comm); +#endif +template void ReduceScatter (float *sbuf, float *rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (double *sbuf, double *rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < float >*sbuf, + Complex < float >*rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < double >*sbuf, + Complex < double >*rbuf, + int rc, Op op, Comm comm); + +template < typename T > +void ReduceScatter (T * sbuf, T * rbuf, int rc, + Comm comm) +{ + ReduceScatter (sbuf, rbuf, rc, mpi::SUM, comm); +} + +template void ReduceScatter (byte * sbuf, byte * rbuf, + int rc, Comm comm); +template void ReduceScatter (int *sbuf, int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned *sbuf, + unsigned *rbuf, int rc, + Comm comm); +template void ReduceScatter (long int *sbuf, + long int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned long *sbuf, + unsigned long *rbuf, int rc, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int ReduceScatter( long long int sb, Op op, Comm comm ); -template unsigned long long ReduceScatter( unsigned long long sb, Op op, Comm comm ); -#endif -template float ReduceScatter( float sb, Op op, Comm comm ); -template double ReduceScatter( 
double sb, Op op, Comm comm ); -template Complex ReduceScatter( Complex sb, Op op, Comm comm ); -template Complex ReduceScatter( Complex sb, Op op, Comm comm ); - -template -T ReduceScatter( T sb, Comm comm ) -{ return ReduceScatter( sb, mpi::SUM, comm ); } - -template byte ReduceScatter( byte sb, Comm comm ); -template int ReduceScatter( int sb, Comm comm ); -template unsigned ReduceScatter( unsigned sb, Comm comm ); -template long int ReduceScatter( long int sb, Comm comm ); -template unsigned long ReduceScatter( unsigned long sb, Comm comm ); +template void ReduceScatter (long long int *sbuf, + long long int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned long long *sbuf, + unsigned long long *rbuf, + int rc, Comm comm); +#endif +template void ReduceScatter (float *sbuf, float *rbuf, + int rc, Comm comm); +template void ReduceScatter (double *sbuf, double *rbuf, + int rc, Comm comm); +template void ReduceScatter (Complex < float >*sbuf, + Complex < float >*rbuf, + int rc, Comm comm); +template void ReduceScatter (Complex < double >*sbuf, + Complex < double >*rbuf, + int rc, Comm comm); + +template < typename T > T ReduceScatter (T sb, Op op, + Comm comm) +{ + T rb; + + ReduceScatter (&sb, &rb, 1, op, comm); + return rb; +} + +template byte ReduceScatter (byte sb, Op op, Comm comm); +template int ReduceScatter (int sb, Op op, Comm comm); +template unsigned ReduceScatter (unsigned sb, Op op, + Comm comm); +template long int ReduceScatter (long int sb, Op op, + Comm comm); +template unsigned long ReduceScatter (unsigned long sb, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int ReduceScatter( long long int sb, Comm comm ); -template unsigned long long ReduceScatter( unsigned long long sb, Comm comm ); +template long long int ReduceScatter (long long int sb, + Op op, Comm comm); +template unsigned long long ReduceScatter (unsigned long + long sb, Op op, + Comm comm); #endif -template float ReduceScatter( float sb, Comm comm ); -template double ReduceScatter( double sb, Comm comm ); -template Complex ReduceScatter( Complex sb, Comm comm ); -template Complex ReduceScatter( Complex sb, Comm comm ); +template float ReduceScatter (float sb, Op op, Comm comm); +template double ReduceScatter (double sb, Op op, + Comm comm); +template Complex < float >ReduceScatter (Complex < + float >sb, Op op, + Comm comm); +template Complex < double >ReduceScatter (Complex < + double >sb, + Op op, + Comm comm); + +template < typename T > T ReduceScatter (T sb, Comm comm) +{ + return ReduceScatter (sb, mpi::SUM, comm); +} -template -void ReduceScatter( R* buf, int rc, Op op, Comm comm ) +template byte ReduceScatter (byte sb, Comm comm); +template int ReduceScatter (int sb, Comm comm); +template unsigned ReduceScatter (unsigned sb, Comm comm); +template long int ReduceScatter (long int sb, Comm comm); +template unsigned long ReduceScatter (unsigned long sb, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template long long int ReduceScatter (long long int sb, + Comm comm); +template unsigned long long ReduceScatter (unsigned long + long sb, + Comm comm); +#endif +template float ReduceScatter (float sb, Comm comm); +template double ReduceScatter (double sb, Comm comm); +template Complex < float >ReduceScatter (Complex < + float >sb, + Comm comm); +template Complex < double >ReduceScatter (Complex < + double >sb, + Comm comm); + +template < typename R > +void ReduceScatter (R * buf, int rc, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + 
DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( buf, rc*commSize, op, comm ); - if( commRank != 0 ) - MemCopy( buf, &buf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (buf, rc * commSize, op, comm); + if (commRank != 0) + MemCopy (buf, &buf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce_scatter_block - ( MPI_IN_PLACE, buf, rc, TypeMap(), op.op, comm.comm ) ); -# else - const int commSize = Size( comm ); - std::vector sendBuf( rc*commSize ); - MemCopy( sendBuf.data(), buf, rc*commSize ); + (MPI_Reduce_scatter_block + (MPI_IN_PLACE, buf, rc, TypeMap < R > (), op.op, + comm.comm)); +#else + const int commSize = Size (comm); + + std::vector < R > sendBuf (rc * commSize); + MemCopy (sendBuf.data (), buf, rc * commSize); SafeMpi - ( MPI_Reduce_scatter_block - ( sendBuf.data(), buf, rc, TypeMap(), op.op, comm.comm ) ); -# endif + (MPI_Reduce_scatter_block + (sendBuf.data (), buf, rc, TypeMap < R > (), + op.op, comm.comm)); +#endif #else - const int commSize = Size( comm ); - Reduce( buf, rc*commSize, op, 0, comm ); - Scatter( buf, rc, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (buf, rc * commSize, op, 0, comm); + Scatter (buf, rc, rc, 0, comm); #endif } // TODO: Handle case where op is not summation -template -void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ) +template < typename R > +void ReduceScatter (Complex < R > *buf, int rc, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( buf, rc*commSize, op, comm ); - if( commRank != 0 ) - MemCopy( buf, &buf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (buf, rc * commSize, op, comm); + if (commRank != 0) + MemCopy (buf, &buf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_AVOID_COMPLEX_MPI -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce_scatter_block - ( MPI_IN_PLACE, buf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# else - const int commSize = Size( comm ); - std::vector> sendBuf( rc*commSize ); - MemCopy( sendBuf.data(), buf, rc*commSize ); + (MPI_Reduce_scatter_block + (MPI_IN_PLACE, buf, 2 * rc, TypeMap < R > (), + op.op, comm.comm)); +#else + const int commSize = Size (comm); + + std::vector < Complex < R >> sendBuf (rc * commSize); + MemCopy (sendBuf.data (), buf, rc * commSize); SafeMpi - ( MPI_Reduce_scatter_block - ( sendBuf.data(), buf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# endif -# else -# ifdef EL_HAVE_MPI_IN_PLACE + (MPI_Reduce_scatter_block + (sendBuf.data (), buf, 2 * rc, TypeMap < R > (), + op.op, comm.comm)); +#endif +#else +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce_scatter_block - ( MPI_IN_PLACE, buf, rc, TypeMap>(), op.op, comm.comm ) ); -# else - const int commSize = Size( comm ); - std::vector> sendBuf( rc*commSize ); - MemCopy( sendBuf.data(), buf, rc*commSize ); + (MPI_Reduce_scatter_block + (MPI_IN_PLACE, buf, rc, + TypeMap < Complex < R >> (), op.op, comm.comm)); +#else + const int commSize = Size (comm); + + std::vector < 
Complex < R >> sendBuf (rc * commSize); + MemCopy (sendBuf.data (), buf, rc * commSize); SafeMpi - ( MPI_Reduce_scatter_block - ( sendBuf.data(), buf, rc, TypeMap>(), op.op, comm.comm ) ); -# endif -# endif + (MPI_Reduce_scatter_block + (sendBuf.data (), buf, rc, + TypeMap < Complex < R >> (), op.op, comm.comm)); +#endif +#endif #else - const int commSize = Size( comm ); - Reduce( buf, rc*commSize, op, 0, comm ); - Scatter( buf, rc, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (buf, rc * commSize, op, 0, comm); + Scatter (buf, rc, rc, 0, comm); #endif } -template void ReduceScatter( byte* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( long int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long* buf, int rc, Op op, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long long* buf, int rc, Op op, Comm comm ); -#endif -template void ReduceScatter( float* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( double* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ); - -template -void ReduceScatter( T* buf, int rc, Comm comm ) -{ ReduceScatter( buf, rc, mpi::SUM, comm ); } - -template void ReduceScatter( byte* buf, int rc, Comm comm ); -template void ReduceScatter( int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned* buf, int rc, Comm comm ); -template void ReduceScatter( long int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned long* buf, int rc, Comm comm ); +template void ReduceScatter (byte * buf, int rc, Op op, + Comm comm); +template void ReduceScatter (int *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (unsigned *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (long int *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (unsigned long *buf, int rc, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned long long* buf, int rc, Comm comm ); +template void ReduceScatter (long long int *buf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned long long *buf, + int rc, Op op, Comm comm); #endif -template void ReduceScatter( float* buf, int rc, Comm comm ); -template void ReduceScatter( double* buf, int rc, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Comm comm ); +template void ReduceScatter (float *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (double *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (Complex < float >*buf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < double >*buf, + int rc, Op op, Comm comm); + +template < typename T > +void ReduceScatter (T * buf, int rc, Comm comm) +{ + ReduceScatter (buf, rc, mpi::SUM, comm); +} -template +template void ReduceScatter (byte * buf, int rc, + Comm comm); +template void ReduceScatter (int *buf, int rc, Comm comm); +template void ReduceScatter (unsigned *buf, int rc, + Comm comm); +template void ReduceScatter (long int *buf, int rc, + Comm comm); +template void 
ReduceScatter (unsigned long *buf, int rc, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void ReduceScatter (long long int *buf, int rc, + Comm comm); +template void ReduceScatter (unsigned long long *buf, + int rc, Comm comm); +#endif +template void ReduceScatter (float *buf, int rc, + Comm comm); +template void ReduceScatter (double *buf, int rc, + Comm comm); +template void ReduceScatter (Complex < float >*buf, + int rc, Comm comm); +template void ReduceScatter (Complex < double >*buf, + int rc, Comm comm); + +template < typename R > void ReduceScatter -( const R* sbuf, R* rbuf, const int* rcs, Op op, Comm comm ) +(const R * sbuf, R * rbuf, const int *rcs, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) SafeMpi - ( MPI_Reduce_scatter - ( const_cast(sbuf), - rbuf, const_cast(rcs), TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < R * >(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < R > (), op.op, comm.comm)); } -template +template < typename R > void ReduceScatter -( const Complex* sbuf, Complex* rbuf, const int* rcs, Op op, Comm comm ) +(const Complex < R > *sbuf, Complex < R > *rbuf, + const int *rcs, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { int p; - MPI_Comm_size( comm.comm, &p ); - std::vector rcsDoubled(p); - for( int i=0; ircsDoubled (p); + + for (int i = 0; i < p; ++i) + rcsDoubled[i] = 2 * rcs[i]; SafeMpi - ( MPI_Reduce_scatter - ( const_cast*>(sbuf), - rbuf, rcsDoubled.data(), TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, rcsDoubled.data (), + TypeMap < R > (), op.op, comm.comm)); } else { SafeMpi - ( MPI_Reduce_scatter - ( const_cast*>(sbuf), - rbuf, const_cast(rcs), TypeMap>(), - op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < Complex < R >> (), op.op, + comm.comm)); } #else SafeMpi - ( MPI_Reduce_scatter - ( const_cast*>(sbuf), - rbuf, const_cast(rcs), TypeMap>(), op.op, - comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < Complex < R >> (), op.op, + comm.comm)); #endif } -template void ReduceScatter( const byte* sbuf, byte* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const int* sbuf, int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned* sbuf, unsigned* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const long int* sbuf, long int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned long* sbuf, unsigned long* rbuf, const int* rcs, Op op, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( const long long int* sbuf, long long int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned long long* sbuf, unsigned long long* rbuf, const int* rcs, Op op, Comm comm ); -#endif -template void ReduceScatter( const float* sbuf, float* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const double* sbuf, double* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* 
rbuf, const int* rcs, Op op, Comm comm ); - -template -void ReduceScatter( const T* sbuf, T* rbuf, const int* rcs, Comm comm ) -{ ReduceScatter( sbuf, rbuf, rcs, mpi::SUM, comm ); } - -template void ReduceScatter( const byte* sbuf, byte* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const int* sbuf, int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned* sbuf, unsigned* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const long int* sbuf, long int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned long* sbuf, unsigned long* rbuf, const int* rcs, Comm comm ); +template void ReduceScatter (const byte * sbuf, + byte * rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const int *sbuf, int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned *sbuf, + unsigned *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const long int *sbuf, + long int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned long *sbuf, + unsigned long *rbuf, + const int *rcs, Op op, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( const long long int* sbuf, long long int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned long long* sbuf, unsigned long long* rbuf, const int* rcs, Comm comm ); +template void ReduceScatter (const long long int *sbuf, + long long int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned long long + *sbuf, + unsigned long long *rbuf, + const int *rcs, Op op, + Comm comm); #endif -template void ReduceScatter( const float* sbuf, float* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const double* sbuf, double* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Comm comm ); +template void ReduceScatter (const float *sbuf, + float *rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const double *sbuf, + double *rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const Complex < float >*sbuf, + Complex < float >*rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const Complex < + double >*sbuf, + Complex < double >*rbuf, + const int *rcs, Op op, + Comm comm); + +template < typename T > +void ReduceScatter (const T * sbuf, T * rbuf, + const int *rcs, Comm comm) +{ + ReduceScatter (sbuf, rbuf, rcs, mpi::SUM, comm); +} -} // namespace mpi -} // namespace El +template void ReduceScatter (const byte * sbuf, + byte * rbuf, const int *rcs, + Comm comm); +template void ReduceScatter (const int *sbuf, int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned *sbuf, + unsigned *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const long int *sbuf, + long int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned long *sbuf, + unsigned long *rbuf, + const int *rcs, Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void ReduceScatter (const long long int *sbuf, + long long int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned long long + *sbuf, + unsigned long long *rbuf, + const int *rcs, Comm comm); +#endif +template void ReduceScatter (const float *sbuf, + 
float *rbuf, const int *rcs, + Comm comm); +template void ReduceScatter (const double *sbuf, + double *rbuf, const int *rcs, + Comm comm); +template void ReduceScatter (const Complex < float >*sbuf, + Complex < float >*rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const Complex < + double >*sbuf, + Complex < double >*rbuf, + const int *rcs, Comm comm); + +} // namespace mpi +} // namespace El From b3e2ec97979dfbb674107f92e81e4fdc57949a44 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 26 Jun 2014 00:10:55 -0500 Subject: [PATCH 006/110] forgot a const --- src/core/AxpyInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index f7e30efac4..2126108ff3 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -191,7 +191,7 @@ namespace El void AxpyInterface < T >::HandleGlobalToLocalRequest () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalRequest")) - DistMatrix < T > &X = *globalToLocalMat_; + const DistMatrix < T > &X = *globalToLocalMat_; const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); From 1f9eb064850b9b826f7231d45f95c2dfc071495f Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 26 Jun 2014 15:46:41 -0500 Subject: [PATCH 007/110] populating/fixing rmainterface... --- include/El/core/RmaInterface.hpp | 7 +-- include/El/core/imports/mpi.hpp | 9 +++- src/core/RmaInterface.cpp | 84 +++++++++++++++++++++++++++++--- src/core/imports/mpi.cpp | 21 ++++---- 4 files changed, 98 insertions(+), 23 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index c8ab37fe87..8668d2b9d5 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -40,15 +40,16 @@ class RmaInterface void Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ); void Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ); - void Flush( const Matrix& Z, Int i, Int j); + void Flush( const Matrix& Z, Int i, Int j ); + void Flush( const Matrix& Z ); void Detach(); private: mpi::Window window; std::vector getVector_, putVector_; - DistMatrix* localToGlobalMat_; - const DistMatrix* globalToLocalMat_; + DistMatrix* GlobalArrayPut_; + const DistMatrix* GlobalArrayGet_; }; } // namespace El diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index db35336503..7ab47790ce 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -85,6 +85,7 @@ typedef enum PARTIAL_ACC_ORDERING = 2, NO_ACC_ORDERING = 4 } acc_order_t; + //TODO update these const int MAX_OUTSTANDING_NB = 100000; const int FLUSH_FREQUENCY = 10000; @@ -190,6 +191,7 @@ void Translate //MPI-3 one-sided #if MPI_VERSION>=3 +// Window creation/update/delete void SetWindowProp (Window& window, int prop); //NOTE assuming MPI_MODE_NOCHECK void WindowLock( int rank, Window& window ); @@ -198,6 +200,8 @@ void WindowUnlock( int rank, Window& window ); void WindowUnlock( Window& window ); void WindowCreate( int size, Comm comm, Window& window ); void WindowCreate( int size, Info info, Comm comm, Window& window ); +void WindowFree (Window & window); +// One-sided operations void Iput( void *source, int source_size, int target_rank, int target_size, Window& window); void Rput( void *source, int source_size, int target_rank, int target_size, @@ -210,8 +214,9 @@ void Iacc( void *source, int source_size, int target_rank, int target_size, Op &op, Window& window); void Racc( void *source, int 
source_size, int target_rank, int target_size, Op &op, Window& window, Request& request); -void Flush( int target_rank, Window& window, bool isLocalCompletion ); -void Flush( Window& window, bool isLocalCompletion ); +// Synchronization +void Flush( int target_rank, Window& window ); +void Flush (Window & window); #endif // Utilities diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 11a17ccf06..c24f7cb648 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -29,14 +29,20 @@ template RmaInterface::RmaInterface( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - const Int p = Z.Grid ().Size (); + GlobalArrayPut_ = &Z; + + const Int p = Z.Grid().Size(); + putVector_.resize( p ); } template RmaInterface::RmaInterface( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - const Int p = X.Grid ().Size (); + GlobalArrayGet_ = &X; + + const Int p = X.Grid ().Size (); + getVector_.resize( p ); } template @@ -66,13 +72,30 @@ template void RmaInterface::Attach( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - + GlobalArrayPut_ = &Z; + + const Grid& g = Z.Grid(); + + //do rma related checks + // find the size of the allocated window + const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); + mpi::WindowCreate (windowsize, g.VCComm (), window); + mpi::WindowLock (window); } template void RmaInterface::Attach( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + GlobalArrayGet_ = &X; + + const Grid& g = X.Grid(); + + //do rma related checks + // find the size of the allocated window + const Int windowsize = X.LocalHeight () * X.LocalWidth () * sizeof (T); + mpi::WindowCreate (windowsize, g.VCComm (), window); + mpi::WindowLock (window); } template @@ -80,7 +103,7 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - DistMatrix& Y = *localToGlobalMat_; + DistMatrix& Y = *GlobalArrayPut_; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -159,7 +182,7 @@ template void RmaInterface::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) - const DistMatrix < T > &X = *globalToLocalMat_; + const DistMatrix < T > &X = *GlobalArrayGet_; const Int height = Z.Height (); const Int width = Z.Width (); @@ -229,7 +252,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - DistMatrix& Y = *localToGlobalMat_; + DistMatrix& Y = *GlobalArrayPut_; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -308,12 +331,61 @@ template void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( 
Int step=0; step +void RmaInterface::Flush( const Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + mpi::Flush (window); } template void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) + + mpi::WindowUnlock (window); + mpi::WindowFree (window); } template class RmaInterface; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 7474570bd8..0a59e50cf2 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -480,6 +480,11 @@ void WindowCreate (int size, Info info, Comm comm, SafeMpi (MPI_Barrier (comm.comm)); } +void WindowFree (Window & window) +{ + SafeMpi (MPI_Win_free (&window)); +} + void Iput (void *source, int source_size, int target_rank, int target_size, Window & window) { @@ -575,24 +580,16 @@ void Racc (void *source, int source_size, int target_rank, window, &request)); } -void Flush (int target_rank, Window & window, - bool isLocalCompletion) +void Flush (int target_rank, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - if (isLocalCompletion) - SafeMpi (MPI_Win_flush_local - (target_rank, window)); - else - SafeMpi (MPI_Win_flush (target_rank, window)); + SafeMpi (MPI_Win_flush (target_rank, window)); } -void Flush (Window & window, bool isLocalCompletion) +void Flush (Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - if (isLocalCompletion) - SafeMpi (MPI_Win_flush_local_all (window)); - else - SafeMpi (MPI_Win_flush_all (window)); + SafeMpi (MPI_Win_flush_all (window)); } #endif From e636c1d5f7ddeddd198cd0de4be45c5c568fdef2 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 27 Jun 2014 00:10:51 -0500 Subject: [PATCH 008/110] intermediate commits --- include/El/core/imports/mpi.hpp | 9 +- src/core/RmaInterface.cpp | 735 ++++++++++++++++---------------- src/core/imports/mpi.cpp | 49 ++- 3 files changed, 410 insertions(+), 383 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 7ab47790ce..94ac616cb5 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -95,6 +95,7 @@ typedef MPI_Info Info; const int ANY_SOURCE = MPI_ANY_SOURCE; const int ANY_TAG = MPI_ANY_TAG; const int ERR_RANK = MPI_ERR_RANK; +const int BOTTOM = MPI_BOTTOM; #ifdef EL_HAVE_MPI_QUERY_THREAD const int THREAD_SINGLE = MPI_THREAD_SINGLE; const int THREAD_FUNNELED = MPI_THREAD_FUNNELED; @@ -191,15 +192,17 @@ void Translate //MPI-3 one-sided #if MPI_VERSION>=3 -// Window creation/update/delete +// Utilities void SetWindowProp (Window& window, int prop); +void CheckBounds (Window & window, Datatype win_type, Datatype type, +size_t count, ptrdiff_t target_offset); //NOTE assuming MPI_MODE_NOCHECK +// Window creation/update/delete void WindowLock( int rank, Window& window ); void WindowLock( Window& window ); void WindowUnlock( int rank, Window& window ); void WindowUnlock( Window& window ); -void WindowCreate( int size, Comm comm, Window& window ); -void WindowCreate( int size, Info info, Comm comm, Window& window ); +void WindowCreate( void* baseptr, int size, Comm comm, Window& window ); void WindowFree (Window & window); // One-sided operations void Iput( void *source, int source_size, int target_rank, diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index c24f7cb648..ce8e736a53 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -4,15 +4,15 @@ Copyright (c) 2014, Jeff Hammond (Intel) All rights reserved. - Authors: - Jeff Hammond adapted the RMA interface from the AXPY one. 
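// A sketch of the life cycle the reworked RmaInterface below is aiming
// for, pieced together from the signatures in this series (hypothetical
// driver code, not part of the patch):
//
//   DistMatrix<double> A( g );        // globally distributed target
//   Matrix<double> Z( m, n );         // local patch to contribute
//   RmaInterface<double> rma( A );
//   rma.Attach( A );                  // collective: WindowCreate + WindowLock
//   rma.Put( 1., Z, 0, 0 );           // one-sided update of A's submatrix
//   rma.Flush( Z, 0, 0 );             // MPI_Win_flush toward the ranks Z touches
//   rma.Detach();                     // WindowUnlock + WindowFree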
+Authors: +Jeff Hammond adapted the RMA interface from the AXPY one. - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at - http://opensource.org/licenses/BSD-2-Clause +This file is part of Elemental and is under the BSD 2-Clause License, +which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" - + // This is direct copy-paste from // El two-sided implementation with // point-to-point replaced by one-sided @@ -25,374 +25,379 @@ namespace El { // dont care about const // interfaces now -template -RmaInterface::RmaInterface( DistMatrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - GlobalArrayPut_ = &Z; - - const Int p = Z.Grid().Size(); - putVector_.resize( p ); -} - -template -RmaInterface::RmaInterface( const DistMatrix& X ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - GlobalArrayGet_ = &X; - - const Int p = X.Grid ().Size (); - getVector_.resize( p ); -} - -template -RmaInterface::~RmaInterface() -{ - { - if( std::uncaught_exception() ) - { - std::ostringstream os; - os << "Uncaught exception detected during RmaInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) - } - else - { - Detach(); - } - } -} - -template -void RmaInterface::Attach( DistMatrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - GlobalArrayPut_ = &Z; - - const Grid& g = Z.Grid(); - - //do rma related checks - // find the size of the allocated window - const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); - mpi::WindowCreate (windowsize, g.VCComm (), window); - mpi::WindowLock (window); -} - -template -void RmaInterface::Attach( const DistMatrix& X ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - GlobalArrayGet_ = &X; - - const Grid& g = X.Grid(); - - //do rma related checks - // find the size of the allocated window - const Int windowsize = X.LocalHeight () * X.LocalWidth () * sizeof (T); - mpi::WindowCreate (windowsize, g.VCComm (), window); - mpi::WindowLock (window); -} - -template -void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - - DistMatrix& Y = *GlobalArrayPut_; - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - //do rma related checks - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // put local matrix cells in - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = 
width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - // Pack the payload - // consider ddt here - T* sendData = reinterpret_cast(head); - const T* XBuffer = Y.LockedBuffer(); - const Int XLDim = Y.LDim(); - for( Int t=0; t + RmaInterface::RmaInterface( DistMatrix& Z ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) + GlobalArrayPut_ = &Z; + + const Int p = Z.Grid().Size(); + putVector_.resize( p ); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } -} - -template -void RmaInterface::Put( T alpha, const Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) -} - -template -void RmaInterface::Get( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) - const DistMatrix < T > &X = *GlobalArrayGet_; - - const Int height = Z.Height (); - const Int width = Z.Width (); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); - - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - - for (Int rank = 0; rank < p; ++rank) - { - // this is horrendously wrong, but - // just for compiling - const Int buffersize = height * width * sizeof(T); - getVector_.resize (buffersize); - byte *getBuffer = getVector_.data (); - // how do we know the data size - mpi::Iget (getBuffer, buffersize, rank, buffersize, window); - // Extract the header - byte *head = getBuffer; - const Int i = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) + template + RmaInterface::RmaInterface( const DistMatrix& X ) { - T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; + DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) + GlobalArrayGet_ = &X; + + const Int p = X.Grid ().Size (); + getVector_.resize( p ); } - } -} - -template -void RmaInterface::Get( const Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) -} - -// scaled accumulate -template -void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - - DistMatrix& Y = *GlobalArrayPut_; - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - //do rma related checks - - const Grid& g = Y.Grid(); - const 
Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // put local matrix cells in - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - // Pack the payload - // consider ddt here - T* sendData = reinterpret_cast(head); - const T* XBuffer = Z.LockedBuffer(); - const Int XLDim = Z.LDim(); - for( Int t=0; t + RmaInterface::~RmaInterface() + { + { + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during RmaInterface destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY(DumpCallStack()) + } + else + { + Detach(); + } + } } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } -} - -template -void RmaInterface::Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) -} - -template -void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - DistMatrix& Y = *GlobalArrayPut_; - - //do rma related checks - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // put local matrix cells in - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step + void RmaInterface::Attach( DistMatrix& Z ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + GlobalArrayPut_ = &Z; + const Grid& g = Z.Grid(); + + // return submatrix + //do rma related checks + const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); + + void* baseptr = (void *)Z.Buffer (); + + mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); + mpi::WindowLock (window); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } -} - -template -void RmaInterface::Flush( const Matrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - mpi::Flush (window); -} - -template -void RmaInterface::Detach() -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) + template + void RmaInterface::Attach( const DistMatrix& X ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + GlobalArrayGet_ = &X; - mpi::WindowUnlock (window); - mpi::WindowFree (window); -} - -template class RmaInterface; -template class RmaInterface; -template class RmaInterface; -template 
class RmaInterface>; -template class RmaInterface>; + const DistMatrix < T > &Z = *GlobalArrayGet_; + + const Grid& g = X.Grid(); + + //do rma related checks + // find the size of the allocated window + const Int windowsize = X.LocalHeight () * X.LocalWidth () * sizeof (T); + void* baseptr = (void *)Z.LockedBuffer (); + mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); + mpi::WindowLock (window); + } + + template + void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + + DistMatrix& Y = *GlobalArrayPut_; + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head) = i; head += sizeof(Int); + *reinterpret_cast(head) = j; head += sizeof(Int); + *reinterpret_cast(head) = height; head += sizeof(Int); + *reinterpret_cast(head) = width; head += sizeof(Int); + *reinterpret_cast(head) = alpha; head += sizeof(T); + + // Pack the payload + // consider ddt here + T* sendData = reinterpret_cast(head); + const T* XBuffer = Y.LockedBuffer(); + const Int XLDim = Y.LDim(); + for( Int t=0; t + void RmaInterface::Put( T alpha, const Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + } + + template + void RmaInterface::Get( Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + const DistMatrix < T > &X = *GlobalArrayGet_; + + const Int height = Z.Height (); + const Int width = Z.Width (); + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + + for (Int rank = 0; rank < p; ++rank) + { + // this is horrendously wrong, but + // just for compiling + const Int buffersize = height * width * sizeof(T); + getVector_.resize (buffersize); + byte *getBuffer = getVector_.data (); + // how do we know the data size + mpi::Iget (getBuffer, buffersize, rank, buffersize, window); + // Extract the header + byte *head = getBuffer; + const Int i = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T alpha = *reinterpret_cast < const T * >(head); + head += sizeof (T); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(head); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + 
const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += alpha * XCol[s]; + } + } + } + + template + void RmaInterface::Get( const Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + } + + // scaled accumulate + template + void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + + DistMatrix& Y = *GlobalArrayPut_; + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head) = i; head += sizeof(Int); + *reinterpret_cast(head) = j; head += sizeof(Int); + *reinterpret_cast(head) = height; head += sizeof(Int); + *reinterpret_cast(head) = width; head += sizeof(Int); + *reinterpret_cast(head) = alpha; head += sizeof(T); + + // Pack the payload + // consider ddt here + T* sendData = reinterpret_cast(head); + const T* XBuffer = Z.LockedBuffer(); + const Int XLDim = Z.LDim(); + for( Int t=0; t + void RmaInterface::Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + } + + template + void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step + void RmaInterface::Flush( const Matrix& Z ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + mpi::Flush (window); + } + + template + void RmaInterface::Detach() + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) + + mpi::WindowUnlock (window); + mpi::WindowFree (window); + } + + template class RmaInterface; + template class RmaInterface; + template class RmaInterface; + template class RmaInterface>; + template class RmaInterface>; } // namespace El #endif diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 0a59e50cf2..e494c8d905 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -10,7 +10,7 @@ http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" - +#include typedef unsigned 
char *UCP; namespace @@ -454,35 +454,54 @@ void WindowUnlock (Window & window) SafeMpi (MPI_Win_unlock_all (window)); } -void WindowCreate (int size, Comm comm, Window & window) +// RMA Utilities +void WindowCreate (void *baseptr, int size, Comm comm, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Windowcreate")) - void *baseptr; - SafeMpi (MPI_Win_allocate - ((MPI_Aint) size, 1, MPI_INFO_NULL, comm.comm, - baseptr, &window)); + // use alloc_shm + SafeMpi (MPI_Win_create + (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, + comm.comm, &window)); #ifdef EL_NO_ACC_ORDERING SetWindowProp (window, NO_ACC_ORDERING); #endif - SafeMpi (MPI_Barrier (comm.comm)); } -void WindowCreate (int size, Info info, Comm comm, - Window & window) +void CheckBounds (Window & window, Datatype win_type, Datatype type, +size_t count, ptrdiff_t target_offset) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Windowcreate")) - void *baseptr; + int flag, type_size, win_type_size; + size_t displ; + void * dest=NULL; - SafeMpi (MPI_Win_allocate - ((MPI_Aint) size, 1, info, comm.comm, baseptr, - &window)); - SafeMpi (MPI_Barrier (comm.comm)); + SafeMpi (MPI_Type_size (type, &type_size)); + SafeMpi (MPI_Type_size (win_type, &win_type_size)); + Aint lb, extent; + + SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); + + /* Calculate displacement from beginning of the window */ + if (dest == MPI_BOTTOM) + displ = 0; + else + displ = (size_t) ((uint8_t*)(dest + target_offset * type_size) - (uint8_t*)dest); + + SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); + + // invalid remote address + assert (displ >= 0 && displ < win_type_size); + // transfer out of range + assert (displ + count*extent <= win_type_size); } void WindowFree (Window & window) { + void* baseptr = NULL; + int flag; + SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, baseptr, &flag /* unused */)); SafeMpi (MPI_Win_free (&window)); + free (baseptr); } void Iput (void *source, int source_size, int target_rank, From aa88f1b829de49732c81d67e2dcaaa1db8b813e0 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 30 Jun 2014 10:23:23 -0500 Subject: [PATCH 009/110] updated macro names - use-barrier to use-nbc, also added provisions to use wait instead of test in nbc version --- include/El/core/AxpyInterface.hpp | 2 +- include/El/core/imports/mpi.hpp | 19 +++++++++++++------ src/core/AxpyInterface.cpp | 15 ++++++++++----- src/core/imports/mpi.cpp | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 867ba1cef5..c3cf52fbd6 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -48,7 +48,7 @@ class AxpyInterface DATA_REPLY_TAG =4; //request object for polling on Issends -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) bool all_sends_are_finished; #endif bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 94ac616cb5..9b8fe0da1d 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -29,12 +29,20 @@ namespace mpi { #endif #endif -//Use MPI-3 IBarrier instead of El strict EOM matching -#ifndef EL_USE_IBARRIER -#define EL_USE_IBARRIER +// Use MPI-3 IBarrier in developing a Non-blocking +// consensus instead of El strict EOM matching +// see - Scalable communication protocols for +// dynamic sparse data exchange by Hoefler, et al +#ifndef 
EL_USE_NONBLOCKING_CONSENSUS +#define EL_USE_NONBLOCKING_CONSENSUS #endif -//Experimental MPI performance enhancers +// TODO Give this a better name +#ifndef EL_PREFER_WAIT_OVER_TEST +#define EL_PREFER_WAIT_OVER_TEST +#endif + +// Experimental MPI performance enhancers #ifndef EL_MPI_EXPERIMENTAL #define EL_MPI_EXPERIMENTAL #endif @@ -95,7 +103,6 @@ typedef MPI_Info Info; const int ANY_SOURCE = MPI_ANY_SOURCE; const int ANY_TAG = MPI_ANY_TAG; const int ERR_RANK = MPI_ERR_RANK; -const int BOTTOM = MPI_BOTTOM; #ifdef EL_HAVE_MPI_QUERY_THREAD const int THREAD_SINGLE = MPI_THREAD_SINGLE; const int THREAD_FUNNELED = MPI_THREAD_FUNNELED; @@ -224,7 +231,7 @@ void Flush (Window & window); // Utilities void Barrier( Comm comm ); -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 void IBarrier( Comm comm, Request& request ); #endif void Wait( Request& request ); diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 2126108ff3..252c3c0ed8 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -112,7 +112,7 @@ namespace El if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) { -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = true; #endif // Message exists, so recv and pack @@ -425,7 +425,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = false; #endif const Int p = X.Grid ().Size (); @@ -484,7 +484,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) LogicError ("Submatrix out of bounds of global matrix"); -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = false; #endif const Grid & g = Y.Grid (); @@ -552,7 +552,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); } -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = true; #endif receivingRow = (receivingRow + 1) % r; @@ -731,7 +731,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (attachedForLocalToGlobal_) { -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; @@ -741,7 +741,12 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (nb_bar_active) { // test for IBarrier completion +#if defined(EL_PREFER_WAIT_OVER_TEST) + mpi::Wait (nb_bar_request); + DONE = true; +#else DONE = mpi::Test (nb_bar_request); +#endif } else { diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index e494c8d905..8b2512a65e 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -622,7 +622,7 @@ void Barrier (Comm comm) SafeMpi (MPI_Barrier (comm.comm)); } -#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER) +#if MPI_VERSION>=3 void IBarrier (Comm comm, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::IBarrier")) From e4a5d6cf1fa23c2ceac4a4aaae9a80e9d3184ae1 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 30 Jun 2014 16:04:53 -0500 
Subject: [PATCH 010/110] comments --- src/core/RmaInterface.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index ce8e736a53..d46b6811de 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -76,7 +76,7 @@ namespace El { const Grid& g = Z.Grid(); // return submatrix - //do rma related checks + // do rma related checks const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); void* baseptr = (void *)Z.Buffer (); @@ -85,6 +85,8 @@ namespace El { mpi::WindowLock (window); } + // TODO alpha in Put/Get is standing out and would hinder generalization + // can we circumvent this? template void RmaInterface::Attach( const DistMatrix& X ) { @@ -107,8 +109,7 @@ namespace El { void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - - DistMatrix& Y = *GlobalArrayPut_; + DistMatrix& Y = *GlobalArrayPut_; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -245,6 +246,7 @@ namespace El { } } + // TODO will deal with const interfaces later template void RmaInterface::Get( const Matrix& Z, Int i, Int j ) { From d64331ec4b8286e5853b4a7e6aa5106aac7f7338 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 30 Jun 2014 18:34:18 -0500 Subject: [PATCH 011/110] forgot to add a constructor --- src/core/RmaInterface.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index d46b6811de..79da825683 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -25,6 +25,12 @@ namespace El { // dont care about const // interfaces now + template + RmaInterface::RmaInterface() + : GlobalArrayPut_(0), GlobalArrayGet_(0), + putVector_(0), getVector_(0), window (MPI_WIN_NULL) + { } + template RmaInterface::RmaInterface( DistMatrix& Z ) { From 6c844d7393205490b5c60477dccb6abfdf219eb1 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 1 Jul 2014 15:13:50 -0500 Subject: [PATCH 012/110] added a requestfree function to free spurious requests, also added a variant that waits instead of test in nbc...found a bug - when axpy dim < npes the nbc variant deadlocks...fixing it after some rma stuff --- include/El/core/imports/mpi.hpp | 2 +- src/core/AxpyInterface.cpp | 58 +++++++++++++++++++-------------- src/core/imports/mpi.cpp | 9 ++++- 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 9b8fe0da1d..89ac9de942 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -246,7 +246,7 @@ bool Testany( int count, Request* requests ); bool Testany( int count, Request* requests, int& indx ); bool Testany( int count, Request* requests, int& indx, Status& status ); bool IProbe( int source, int tag, Comm comm, Status& status ); - +void RequestFree( Request& request ); template int GetCount( Status& status ); diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 252c3c0ed8..b3ab8ba06e 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -551,6 +551,10 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +// we won't use this request, so free it + mpi::RequestFree 
(dataSendRequests_[destination][index]); +#endif } #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = true; @@ -728,36 +732,40 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Grid & g = (attachedForLocalToGlobal_ ? localToGlobalMat_->Grid () : globalToLocalMat_-> Grid ()); - +// TODO Fix bug which causes deadlock in NBC version +// when AXPY_DIM < NPES. Also the Wait variant is buggy +// in it's case HandleLocalToGlobalData after the if loop +// causes it to work. if (attachedForLocalToGlobal_) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - while (!DONE) - { - HandleLocalToGlobalData (); - if (nb_bar_active) - { - // test for IBarrier completion + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while (!DONE) + { + HandleLocalToGlobalData (); + if (nb_bar_active) + { + // test/wait for IBarrier completion #if defined(EL_PREFER_WAIT_OVER_TEST) - mpi::Wait (nb_bar_request); - DONE = true; + mpi::Wait (nb_bar_request); + DONE = true; #else - DONE = mpi::Test (nb_bar_request); -#endif - } - else - { - if (all_sends_are_finished) - { - // all ssends are complete, start nonblocking barrier - mpi::IBarrier (g.VCComm (), nb_bar_request); - nb_bar_active = true; - } - } - } + DONE = mpi::Test (nb_bar_request); +#endif + } + else + { + if (all_sends_are_finished) + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } + } #else while (!Finished ()) { diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 8b2512a65e..f777f2ac36 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -614,6 +614,12 @@ void Flush (Window & window) // Various utilities // ================= +// Free request +void RequestFree (Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::RequestFree")) + SafeMpi (MPI_Request_free (&request)); +} // Wait until every process in comm reaches this statement void Barrier (Comm comm) @@ -1085,7 +1091,8 @@ void TaggedISSend Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) - SafeMpi + + SafeMpi (MPI_Issend (const_cast < R * >(buf), count, TypeMap < R > (), to, tag, comm.comm, From e12bd085b030c460199773df2c7c750a7a63e9e0 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 1 Jul 2014 23:49:01 -0500 Subject: [PATCH 013/110] fixing put --- src/core/RmaInterface.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 79da825683..acb7dd9746 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -12,7 +12,7 @@ which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" - +#include // This is direct copy-paste from // El two-sided implementation with // point-to-point replaced by one-sided @@ -85,14 +85,16 @@ namespace El { // do rma related checks const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); + // TODO C++ way of type casting? void* baseptr = (void *)Z.Buffer (); + // TODO Use DEBUG_ONLY or something that EL provides + assert(baseptr != NULL); mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); mpi::WindowLock (window); } - // TODO alpha in Put/Get is standing out and would hinder generalization - // can we circumvent this? 
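// The two casting/assert TODOs above could be settled with a static_cast
// and the existing DEBUG_ONLY/LogicError machinery; a hypothetical
// cleanup (a sketch, not part of this patch) would read:
//
//   void* baseptr = static_cast<void*>( Z.Buffer() );
//   DEBUG_ONLY(
//       if( baseptr == nullptr )
//           LogicError("Attach: local window buffer is null");
//   )
//   mpi::WindowCreate( baseptr, windowsize, g.VCComm (), window );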
+ // TODO Perhaps we should mention scale instead of alpha ala ARMCI template void RmaInterface::Attach( const DistMatrix& X ) { @@ -107,6 +109,8 @@ namespace El { // find the size of the allocated window const Int windowsize = X.LocalHeight () * X.LocalWidth () * sizeof (T); void* baseptr = (void *)Z.LockedBuffer (); + assert (baseptr != NULL); + mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); mpi::WindowLock (window); } @@ -136,6 +140,7 @@ namespace El { const Int height = Z.Height(); const Int width = Z.Width(); + std::cout << "After initial width/height functions...\n"; // put local matrix cells in // correct places in global array Int receivingRow = myProcessRow; @@ -144,8 +149,10 @@ namespace El { { const Int colShift = Shift( receivingRow, colAlign, r ); const Int rowShift = Shift( receivingCol, rowAlign, c ); + std::cout << "After colshift/rowshift...\n"; const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); + std::cout << "After Length...\n"; const Int numEntries = localHeight*localWidth; if( numEntries != 0 ) @@ -154,14 +161,18 @@ namespace El { const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); // Pack the header // make variable names rma friendly - byte* sendBuffer; + + putVector_.resize( bufferSize ); + byte* sendBuffer = putVector_.data(); byte* head = sendBuffer; + std::cout << "After pointing head to sendbuffer\n"; *reinterpret_cast(head) = i; head += sizeof(Int); *reinterpret_cast(head) = j; head += sizeof(Int); *reinterpret_cast(head) = height; head += sizeof(Int); *reinterpret_cast(head) = width; head += sizeof(Int); *reinterpret_cast(head) = alpha; head += sizeof(T); + std::cout << "Before packing payload\n"; // Pack the payload // consider ddt here T* sendData = reinterpret_cast(head); @@ -174,14 +185,18 @@ namespace El { for( Int s=0; s From f55685f1e1d6237cf80a978ecb7e7cbd3de40f6e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 10:39:08 -0500 Subject: [PATCH 014/110] modifications to fix errors in p/g/a --- include/El/core/RmaInterface.hpp | 5 +- src/core/AxpyInterface.cpp | 44 ++++---- src/core/RmaInterface.cpp | 175 ++++++++++++++++++------------- 3 files changed, 131 insertions(+), 93 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 8668d2b9d5..d7ef6dd828 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -40,8 +40,9 @@ class RmaInterface void Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ); void Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ); - void Flush( const Matrix& Z, Int i, Int j ); - void Flush( const Matrix& Z ); + //TODO const interfaces + void Flush( Matrix& Z, Int i, Int j ); + void Flush( Matrix& Z ); void Detach(); diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index b3ab8ba06e..d3652f2d97 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -514,14 +514,26 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int destination = receivingRow + r * receivingCol; const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); +// TODO the size of request object is set in this function +// bypassing it means passing same request handle multiple +// times, we don't care about it in NbC version though(?) 
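// Some context for the comment above and for the #if block that follows:
// a request handed to MPI_Issend stays active until it is completed or
// freed, so passing a still-live handle to a second Issend is erroneous --
// which is exactly what ReadyForSend's slot recycling normally prevents.
// In the nonblocking consensus (alg. 2 of the Hoefler et al. DSDE paper)
// the sender never tests these requests; termination is detected through
// the IBarrier instead. A sketch of that protocol in plain MPI, where
// NbcTermination, progress() (standing in for HandleLocalToGlobalData)
// and allSendsFinished() (the all_sends_are_finished flag) are
// illustrative names, not code from this series:

#include <mpi.h>

void NbcTermination( MPI_Comm comm,            // g.VCComm()'s communicator
                     void (*progress)(),
                     bool (*allSendsFinished)() )
{
    MPI_Request barrier = MPI_REQUEST_NULL;
    int done = 0;
    bool barrierActive = false;
    while( !done )
    {
        // keep draining incoming messages; a blocking Wait instead of the
        // Test below would stop this draining and is one way the
        // small-matrix deadlock arises
        progress();
        if( barrierActive )
            MPI_Test( &barrier, &done, MPI_STATUS_IGNORE );
        else if( allSendsFinished() )
        {
            // all of this rank's Issends have completed locally,
            // so it can join the consensus
            MPI_Ibarrier( comm, &barrier );
            barrierActive = true;
        }
    }
}

// A corollary for the RequestFree experiment below: once a send request
// is freed, its completion can no longer be observed, so the send buffer
// must stay alive until the consensus itself proves delivery.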
+#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + //const Int numCreated = dataVectors_[destination].size(); + const Int index = 0;//numCreated; + //dataVectors_[destination].resize (numCreated + 1); + //dataVectors_[numCreated].resize (bufferSize); + dataVectors_[0].resize (bufferSize); + //dataSendRequests_[destination].push_back (mpi::REQUEST_NULL); + //sendingData_[destination].push_back (true); +#else const Int index = - ReadyForSend (bufferSize, dataVectors_[destination], - dataSendRequests_[destination], - sendingData_[destination]); + ReadyForSend (bufferSize, dataVectors_[destination], + dataSendRequests_[destination], + sendingData_[destination]); +#endif DEBUG_ONLY (if (Int (dataVectors_[destination][index].size ()) != bufferSize) LogicError ("Error in ReadyForSend");) - // Pack the header byte *sendBuffer = dataVectors_[destination][index].data (); byte *head = sendBuffer; @@ -551,10 +563,10 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) // we won't use this request, so free it - mpi::RequestFree (dataSendRequests_[destination][index]); -#endif +// mpi::RequestFree (dataSendRequests_[destination][index]); +//#endif } #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = true; @@ -668,14 +680,14 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) std::deque < bool > &requestStatuses) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReadyForSend")) - const Int numCreated = sendVectors.size (); + const Int numCreated = sendVectors.size (); DEBUG_ONLY (if (numCreated != Int (requests.size ()) || numCreated != Int (requestStatuses.size ()))LogicError ("size mismatch");) - for (Int i = 0; i < numCreated; ++i) + for (Int i = 0; i < numCreated; ++i) { - // If this request is still running, test to see if it finished. + // If this request is still running, test to see if it finished. if (requestStatuses[i]) { const bool finished = mpi::Test (requests[i]); @@ -689,12 +701,11 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) return i; } } - sendVectors.resize (numCreated + 1); sendVectors[numCreated].resize (sendSize); requests.push_back (mpi::REQUEST_NULL); requestStatuses.push_back (true); - + return numCreated; } @@ -733,9 +744,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) localToGlobalMat_->Grid () : globalToLocalMat_-> Grid ()); // TODO Fix bug which causes deadlock in NBC version -// when AXPY_DIM < NPES. Also the Wait variant is buggy -// in it's case HandleLocalToGlobalData after the if loop -// causes it to work. 
+// when for small AXPY_DIMs if (attachedForLocalToGlobal_) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) @@ -749,12 +758,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (nb_bar_active) { // test/wait for IBarrier completion -#if defined(EL_PREFER_WAIT_OVER_TEST) - mpi::Wait (nb_bar_request); - DONE = true; -#else DONE = mpi::Test (nb_bar_request); -#endif } else { diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index acb7dd9746..5cf9964db6 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -83,18 +83,18 @@ namespace El { // return submatrix // do rma related checks - const Int windowsize = Z.LocalHeight () * Z.LocalWidth () * sizeof (T); - + // extra for headers + const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); // TODO C++ way of type casting? void* baseptr = (void *)Z.Buffer (); // TODO Use DEBUG_ONLY or something that EL provides assert(baseptr != NULL); - mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); } - // TODO Perhaps we should mention scale instead of alpha ala ARMCI template void RmaInterface::Attach( const DistMatrix& X ) { @@ -106,12 +106,14 @@ namespace El { const Grid& g = X.Grid(); //do rma related checks - // find the size of the allocated window - const Int windowsize = X.LocalHeight () * X.LocalWidth () * sizeof (T); + // extra for headers + const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + void* baseptr = (void *)Z.LockedBuffer (); assert (baseptr != NULL); - mpi::WindowCreate (baseptr, windowsize, g.VCComm (), window); + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); } @@ -140,7 +142,6 @@ namespace El { const Int height = Z.Height(); const Int width = Z.Width(); - std::cout << "After initial width/height functions...\n"; // put local matrix cells in // correct places in global array Int receivingRow = myProcessRow; @@ -149,10 +150,8 @@ namespace El { { const Int colShift = Shift( receivingRow, colAlign, r ); const Int rowShift = Shift( receivingCol, rowAlign, c ); - std::cout << "After colshift/rowshift...\n"; const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - std::cout << "After Length...\n"; const Int numEntries = localHeight*localWidth; if( numEntries != 0 ) @@ -176,7 +175,7 @@ namespace El { // Pack the payload // consider ddt here T* sendData = reinterpret_cast(head); - const T* XBuffer = Y.LockedBuffer(); + const T* XBuffer = Y.Buffer(); const Int XLDim = Y.LDim(); for( Int t=0; t &X = *GlobalArrayGet_; - const Int height = Z.Height (); - const Int width = Z.Width (); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); - const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int p = g.Size (); const Int myRow = g.Row (); const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - for (Int rank = 0; rank < p; ++rank) + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = 
(X.RowAlign() + j) % c; + + // get into local matrix cells from + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight*localWidth; + + if( numEntries != 0 ) { - T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; + const Int destination = receivingRow + r*receivingCol; + const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + + getVector_.resize (bufferSize); + byte *getBuffer = getVector_.data (); + + mpi::Iget (getBuffer, bufferSize, destination, bufferSize, window); + //do we need flush here? 
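// Answering the question above: yes. The origin buffer of a get may not
// be read until the operation completes locally; under the lock_all
// epoch opened in Attach, that means a flush (or a request-based get)
// before the header is unpacked. A sketch against the window's raw
// MPI_Win, with 'disp' a hypothetical target displacement that this
// series still has to work out:

MPI_Aint disp = 0;                 // placeholder displacement
MPI_Get( getBuffer, bufferSize, MPI_BYTE,
         destination, disp, bufferSize, MPI_BYTE, window );
MPI_Win_flush_local( destination, window );   // getBuffer is now valid

// or, with a request that can be waited on:
MPI_Request req;
MPI_Rget( getBuffer, bufferSize, MPI_BYTE,
          destination, disp, bufferSize, MPI_BYTE, window, &req );
MPI_Wait( &req, MPI_STATUS_IGNORE );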
+ + // Extract the header + byte *head = getBuffer; + const Int i = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T alpha = *reinterpret_cast < const T * >(head); + head += sizeof (T); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(head); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += alpha * XCol[s]; + } } + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; } + // Free the memory for the get buffer + getVector_.clear(); } // TODO will deal with const interfaces later @@ -318,8 +340,10 @@ namespace El { const Int destination = receivingRow + r*receivingCol; const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); // Pack the header - // make variable names rma friendly - byte* sendBuffer; + // make variable names rma friendly + putVector_.resize( bufferSize ); + byte* sendBuffer = putVector_.data(); + byte* head = sendBuffer; *reinterpret_cast(head) = i; head += sizeof(Int); *reinterpret_cast(head) = j; head += sizeof(Int); @@ -330,7 +354,7 @@ namespace El { // Pack the payload // consider ddt here T* sendData = reinterpret_cast(head); - const T* XBuffer = Z.LockedBuffer(); + const T* XBuffer = Z.Buffer(); const Int XLDim = Z.LDim(); for( Int t=0; t @@ -356,13 +383,12 @@ namespace El { } template - void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) + void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) DistMatrix& Y = *GlobalArrayPut_; //do rma related checks - const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); @@ -376,8 +402,7 @@ namespace El { const Int height = Z.Height(); const Int width = Z.Width(); - // put local matrix cells in - // correct places in global array + // find destination Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; for( Int step=0; step - void RmaInterface::Flush( const Matrix& Z ) + void RmaInterface::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + // rma checks, see if Z is not NULL mpi::Flush (window); } @@ -411,9 +437,16 @@ namespace El { void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) + //if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) + // LogicError ("Must attach before detaching."); + + //do rma related checks + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); - mpi::WindowUnlock (window); + mpi::WindowUnlock (window); mpi::WindowFree (window); + mpi::Barrier (g.VCComm ()); } template class RmaInterface; From 18598b750ac36dc3e982e1518b26bc63b4f718f4 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 13:41:42 -0500 Subject: 
[PATCH 015/110] intermediate commit, cleaning up rma interface --- include/El/core/RmaInterface.hpp | 4 +- src/core/RmaInterface.cpp | 149 +++++++++++++++++++++++++------ src/core/imports/mpi.cpp | 2 +- 3 files changed, 127 insertions(+), 28 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index d7ef6dd828..370896cb88 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -40,9 +40,10 @@ class RmaInterface void Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ); void Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ); - //TODO const interfaces void Flush( Matrix& Z, Int i, Int j ); + void Flush( const Matrix& Z, Int i, Int j ); void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); void Detach(); @@ -51,6 +52,7 @@ class RmaInterface std::vector getVector_, putVector_; DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; + bool attachedForPut_, attachedForGet_; }; } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 5cf9964db6..b782638646 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -28,14 +28,19 @@ namespace El { template RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), - putVector_(0), getVector_(0), window (MPI_WIN_NULL) + putVector_(0), getVector_(0), window (MPI_WIN_NULL), + attachedForPut_(false), attachedForGet_(false) { } template RmaInterface::RmaInterface( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - GlobalArrayPut_ = &Z; + + attachedForGet_ = false; + attachedForPut_ = true; + GlobalArrayPut_ = &Z; + GlobalArrayGet_ = 0; const Int p = Z.Grid().Size(); putVector_.resize( p ); @@ -45,7 +50,11 @@ namespace El { RmaInterface::RmaInterface( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - GlobalArrayGet_ = &X; + + attachedForGet_ = true; + attachedForPut_ = false; + GlobalArrayGet_ = &X; + GlobalArrayPut_ = 0; const Int p = X.Grid ().Size (); getVector_.resize( p ); @@ -78,10 +87,11 @@ namespace El { void RmaInterface::Attach( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - GlobalArrayPut_ = &Z; + if( attachedForPut_ || attachedForGet_ ) + LogicError("Must detach before reattaching."); + + GlobalArrayPut_ = &Z; const Grid& g = Z.Grid(); - - // return submatrix // do rma related checks // extra for headers const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); @@ -93,15 +103,19 @@ namespace El { mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); + // do we need a barrier here? + mpi::Barrier (g.VCComm ()); } template void RmaInterface::Attach( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - GlobalArrayGet_ = &X; - - const DistMatrix < T > &Z = *GlobalArrayGet_; + if( attachedForPut_ || attachedForGet_ ) + LogicError("Must detach before reattaching."); + + GlobalArrayGet_ = &X; + const DistMatrix &Z = *GlobalArrayGet_; const Grid& g = X.Grid(); @@ -115,6 +129,8 @@ namespace El { mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); + // do we need a barrier here? 
+ mpi::Barrier (g.VCComm ()); } template @@ -122,6 +138,8 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) DistMatrix& Y = *GlobalArrayPut_; + attachedForPut_ = true; + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -204,8 +222,9 @@ namespace El { void RmaInterface::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) - const DistMatrix < T > &X = *GlobalArrayGet_; - + const DistMatrix < T > &X = *GlobalArrayGet_; + attachedForGet_ = true; + const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); @@ -302,7 +321,9 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - DistMatrix& Y = *GlobalArrayPut_; + DistMatrix& Y = *GlobalArrayPut_; + attachedForPut_ = true; + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -373,7 +394,6 @@ namespace El { } // Free the memory for the put buffer putVector_.clear(); - mpi::Barrier (g.VCComm ()); } template @@ -386,7 +406,56 @@ namespace El { void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - DistMatrix& Y = *GlobalArrayPut_; + if( !attachedForPut_ || !attachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step + void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + if( !attachedForPut_ || !attachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + const DistMatrix& Y = *GlobalArrayGet_; //do rma related checks const Grid& g = Y.Grid(); @@ -411,13 +480,13 @@ namespace El { const Int rowShift = Shift( receivingCol, rowAlign, c ); const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - //const Int numEntries = localHeight*localWidth; + const Int numEntries = localHeight*localWidth; - //if( numEntries != 0 ) - //{ + if( numEntries != 0 ) + { const Int destination = receivingRow + r*receivingCol; mpi::Flush ( destination, window ); - //} + } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -429,24 +498,52 @@ namespace El { void RmaInterface::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - // rma checks, see if Z is not NULL - mpi::Flush (window); + + if( !attachedForPut_ || !attachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // rma checks, see if Z is not NULL, etc + DistMatrix& Y = *GlobalArrayPut_; + + mpi::Flush (window); + } + + template + void RmaInterface::Flush( const Matrix& Z ) + { + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + + if( !attachedForPut_ || !attachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // rma checks, see if Z is not NULL, etc + const DistMatrix& Y = *GlobalArrayGet_; + + mpi::Flush (window); } 
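// For reference, the three completion granularities MPI-3 offers inside
// a passive-target epoch, onto which the Flush overloads above map (the
// isLocalCompletion switch was dropped earlier in this series):
//
//   MPI_Win_flush( rank, win );        // ops to 'rank' complete at target
//   MPI_Win_flush_local( rank, win );  // origin buffers reusable; target
//                                      //   completion not yet guaranteed
//   MPI_Win_flush_all( win );          // complete ops to every target
//
// Flush(Matrix&, i, j) walks the owner ranks of the i,j submatrix and
// issues a per-rank mpi::Flush, while the blanket Flush(Matrix&) falls
// through to MPI_Win_flush_all.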
template void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) - //if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - // LogicError ("Must attach before detaching."); - + if( !attachedForPut_ || !attachedForGet_ ) + LogicError("Must initiate transfer before flushing."); //do rma related checks - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); + + const Grid& g = ( attachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + + mpi::Barrier (g.VCComm ()); + + attachedForPut_ = false; + attachedForGet_ = false; + + putVector_.clear(); + getVector_.clear(); mpi::WindowUnlock (window); mpi::WindowFree (window); - mpi::Barrier (g.VCComm ()); } template class RmaInterface; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index f777f2ac36..a923952959 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -485,7 +485,7 @@ size_t count, ptrdiff_t target_offset) if (dest == MPI_BOTTOM) displ = 0; else - displ = (size_t) ((uint8_t*)(dest + target_offset * type_size) - (uint8_t*)dest); + displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); From 5432b3f7efad9b7e39431d87bd8b42091720bbdc Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 14:25:14 -0500 Subject: [PATCH 016/110] removed spurious instantiations --- src/core/RmaInterface.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index b782638646..fdba15a9ac 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -2,6 +2,7 @@ Copyright (c) 2009-2014, Jack Poulson Copyright (c) 2011, The University of Texas at Austin Copyright (c) 2014, Jeff Hammond (Intel) + Copyright (c) 2014, Sayan Ghosh (University of Houston) All rights reserved. 
Authors: @@ -13,13 +14,11 @@ which can be found in the LICENSE file in the root directory, or at */ #include "El-lite.hpp" #include + // This is direct copy-paste from // El two-sided implementation with // point-to-point replaced by one-sided -// If you're seeing this then at this -// point I just want to compile - #if MPI_VERSION>=3 namespace El { @@ -89,8 +88,9 @@ namespace El { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) if( attachedForPut_ || attachedForGet_ ) LogicError("Must detach before reattaching."); - + GlobalArrayPut_ = &Z; + const Grid& g = Z.Grid(); // do rma related checks // extra for headers @@ -115,16 +115,15 @@ namespace El { LogicError("Must detach before reattaching."); GlobalArrayGet_ = &X; - const DistMatrix &Z = *GlobalArrayGet_; const Grid& g = X.Grid(); //do rma related checks // extra for headers - const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int numEntries = X.LocalHeight () * X.LocalWidth (); const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); - void* baseptr = (void *)Z.LockedBuffer (); + void* baseptr = (void *)X.LockedBuffer (); assert (baseptr != NULL); mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); @@ -138,7 +137,6 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) DistMatrix& Y = *GlobalArrayPut_; - attachedForPut_ = true; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -223,7 +221,6 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) const DistMatrix < T > &X = *GlobalArrayGet_; - attachedForGet_ = true; const Grid & g = X.Grid (); const Int r = g.Height (); @@ -322,7 +319,6 @@ namespace El { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) DistMatrix& Y = *GlobalArrayPut_; - attachedForPut_ = true; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -527,7 +523,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) if( !attachedForPut_ || !attachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + LogicError("Must attach before detaching."); //do rma related checks const Grid& g = ( attachedForPut_ ? 
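A note on the teardown order Detach() settles into around this point in
the series: under the lock_all epoch opened in Attach, closing the epoch
already completes every operation this rank has pending (an unlock
implies a flush), and the free is collective. In plain MPI the sequence
is (a sketch, assuming the window was opened as in Attach):

    MPI_Win_unlock_all( window );  // closes the epoch; completes all of
                                   //   this rank's outstanding RMA
    MPI_Win_free( &window );       // collective over the window's
                                   //   communicator; every rank must have
                                   //   closed its epochs first

The trailing mpi::Barrier is then mostly defensive, keeping a fast rank
from reattaching while its peers are still tearing down.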
From 8a36dd5fbe4c60e900c5970b9d6eb3c9db08dee8 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 15:56:21 -0500 Subject: [PATCH 017/110] added some checks in p/g/a --- src/core/RmaInterface.cpp | 42 ++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index fdba15a9ac..cb1d07a2e7 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -40,9 +40,11 @@ namespace El { attachedForPut_ = true; GlobalArrayPut_ = &Z; GlobalArrayGet_ = 0; + window = MPI_WIN_NULL; - const Int p = Z.Grid().Size(); + const Int p = Z.Grid ().Size(); putVector_.resize( p ); + getVector_.resize( p ); } template @@ -54,9 +56,11 @@ namespace El { attachedForPut_ = false; GlobalArrayGet_ = &X; GlobalArrayPut_ = 0; - + window = MPI_WIN_NULL; + const Int p = X.Grid ().Size (); getVector_.resize( p ); + putVector_.resize( p ); } template @@ -90,7 +94,8 @@ namespace El { LogicError("Must detach before reattaching."); GlobalArrayPut_ = &Z; - + attachedForPut_ = true; + const Grid& g = Z.Grid(); // do rma related checks // extra for headers @@ -115,7 +120,8 @@ namespace El { LogicError("Must detach before reattaching."); GlobalArrayGet_ = &X; - + attachedForGet_ = true; + const Grid& g = X.Grid(); //do rma related checks @@ -136,14 +142,16 @@ namespace El { void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - DistMatrix& Y = *GlobalArrayPut_; if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + if ( !attachedForPut_ ) + LogicError("Global matrix cannot be updated"); + DistMatrix& Y = *GlobalArrayPut_; //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -220,6 +228,9 @@ namespace El { void RmaInterface::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + if ( !attachedForGet_ ) + LogicError("Local matrix cannot be updated"); + const DistMatrix < T > &X = *GlobalArrayGet_; const Grid & g = X.Grid (); @@ -318,10 +329,13 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - DistMatrix& Y = *GlobalArrayPut_; - + if ( !attachedForPut_ ) + LogicError("Global matrix cannot be updated"); if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); + + DistMatrix& Y = *GlobalArrayPut_; + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) LogicError("Submatrix out of bounds of global matrix"); @@ -402,7 +416,7 @@ namespace El { void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ || !attachedForGet_ ) + if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must initiate transfer before flushing."); DistMatrix& Y = *GlobalArrayPut_; @@ -448,7 +462,7 @@ namespace El { void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ || !attachedForGet_ ) + if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must initiate transfer before flushing."); const DistMatrix& Y = *GlobalArrayGet_; @@ -495,7 +509,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ || !attachedForGet_ ) + if( 
!attachedForPut_ && !attachedForGet_ ) LogicError("Must initiate transfer before flushing."); // rma checks, see if Z is not NULL, etc @@ -509,7 +523,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ || !attachedForGet_ ) + if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must initiate transfer before flushing."); // rma checks, see if Z is not NULL, etc @@ -522,7 +536,7 @@ namespace El { void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) - if( !attachedForPut_ || !attachedForGet_ ) + if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must attach before detaching."); //do rma related checks From 26034bd304317f46c5b0517c5e3951caa1dbf6e7 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 17:37:10 -0500 Subject: [PATCH 018/110] intermediate commit --- src/core/RmaInterface.cpp | 40 +++++++++++++++++++-------------------- src/core/imports/mpi.cpp | 8 ++------ 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index cb1d07a2e7..8be6d5100d 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -14,16 +14,12 @@ which can be found in the LICENSE file in the root directory, or at */ #include "El-lite.hpp" #include - -// This is direct copy-paste from -// El two-sided implementation with -// point-to-point replaced by one-sided - + +// TODO complete the const interfaces... +// TODO RMA related checks pending (e.g bounds checking)... #if MPI_VERSION>=3 namespace El { - // dont care about const - // interfaces now template RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), @@ -92,10 +88,14 @@ namespace El { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) if( attachedForPut_ || attachedForGet_ ) LogicError("Must detach before reattaching."); - + + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible GlobalArrayPut_ = &Z; - attachedForPut_ = true; - + attachedForPut_ = true; + GlobalArrayGet_ = &Z; + attachedForGet_ = true; + const Grid& g = Z.Grid(); // do rma related checks // extra for headers @@ -229,9 +229,9 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) if ( !attachedForGet_ ) - LogicError("Local matrix cannot be updated"); + LogicError ("Cannot perform this operation as matrix is not attached."); - const DistMatrix < T > &X = *GlobalArrayGet_; + const DistMatrix &X = *GlobalArrayGet_; const Grid & g = X.Grid (); const Int r = g.Height (); @@ -247,7 +247,7 @@ namespace El { const Int width = Z.Width(); if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); + LogicError("Submatrix out of bounds of global matrix"); const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; @@ -538,22 +538,22 @@ namespace El { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must attach before detaching."); - //do rma related checks - + // Both attachedForPut_ and attachedForGet_ + // could be true, will it cause issues? const Grid& g = ( attachedForPut_ ? 
GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - mpi::Barrier (g.VCComm ()); - attachedForPut_ = false; attachedForGet_ = false; - + + mpi::WindowUnlock (window); + mpi::WindowFree (window); + putVector_.clear(); getVector_.clear(); - mpi::WindowUnlock (window); - mpi::WindowFree (window); + mpi::Barrier (g.VCComm ()); } template class RmaInterface; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index a923952959..ca05fe27fc 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -15,7 +15,7 @@ typedef unsigned char *UCP; namespace { - +//TODO RMA related checks inline void SafeMpi (int mpiError) { DEBUG_ONLY (if (mpiError != MPI_SUCCESS) @@ -457,7 +457,7 @@ void WindowUnlock (Window & window) // RMA Utilities void WindowCreate (void *baseptr, int size, Comm comm, Window & window) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Windowcreate")) + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate")) // use alloc_shm SafeMpi (MPI_Win_create @@ -497,11 +497,7 @@ size_t count, ptrdiff_t target_offset) void WindowFree (Window & window) { - void* baseptr = NULL; - int flag; - SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, baseptr, &flag /* unused */)); SafeMpi (MPI_Win_free (&window)); - free (baseptr); } void Iput (void *source, int source_size, int target_rank, From 46648f996a7f71d0cddfd2d6863df64d29c6427d Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 18:28:35 -0500 Subject: [PATCH 019/110] added a comment in Get --- src/core/RmaInterface.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 8be6d5100d..18bb8befef 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -228,6 +228,8 @@ namespace El { void RmaInterface::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + // a call to Attach with a non-const DistMatrix must set + // attachedForGet_ also, of not DistMatrix isn't attached if ( !attachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); From e160ce66898bc7e05ea454e03d252743226d9a71 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 7 Jul 2014 18:56:18 -0500 Subject: [PATCH 020/110] fixed dtype of mpi-get, updated comment --- src/core/RmaInterface.cpp | 3 ++- src/core/imports/mpi.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 18bb8befef..69295ee495 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -229,7 +229,8 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) // a call to Attach with a non-const DistMatrix must set - // attachedForGet_ also, of not DistMatrix isn't attached + // attachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached if ( !attachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index ca05fe27fc..5307bc7868 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -547,7 +547,7 @@ void Iget (void *source, int source_size, int target_rank, MPI_BYTE, MPI_NO_OP, window)); #else SafeMpi (MPI_Get - (source, (MPI_Aint) source_size, 1, + (source, (MPI_Aint) source_size, MPI_BYTE, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, window)); #endif From 3fa7135ebea08e68e7034f34ab8056614202d0d6 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 8 Jul 2014 00:21:49 -0500 Subject: [PATCH 021/110] 
intermediate commit - introduce another variable to track attachments --- include/El/core/RmaInterface.hpp | 2 +- src/core/RmaInterface.cpp | 57 +++++++++++++++++--------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 370896cb88..634790ea5c 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -52,7 +52,7 @@ class RmaInterface std::vector getVector_, putVector_; DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; - bool attachedForPut_, attachedForGet_; + bool attachedForPut_, attachedForGet_, attached_; }; } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 69295ee495..b0dbf36632 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -32,7 +32,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - attachedForGet_ = false; + attachedForGet_ = true; attachedForPut_ = true; GlobalArrayPut_ = &Z; GlobalArrayGet_ = 0; @@ -86,16 +86,20 @@ namespace El { void RmaInterface::Attach( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if( attachedForPut_ || attachedForGet_ ) - LogicError("Must detach before reattaching."); - + if (attached_) + LogicError("Must detach before reattaching."); + else + attached_ = true; + // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible - GlobalArrayPut_ = &Z; - attachedForPut_ = true; - GlobalArrayGet_ = &Z; - attachedForGet_ = true; - + if( !attachedForPut_ && !attachedForGet_ ) + { + GlobalArrayPut_ = &Z; + GlobalArrayGet_ = &Z; + attachedForPut_ = true; + attachedForGet_ = true; + } const Grid& g = Z.Grid(); // do rma related checks // extra for headers @@ -108,20 +112,23 @@ namespace El { mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); - // do we need a barrier here? - mpi::Barrier (g.VCComm ()); } template void RmaInterface::Attach( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if( attachedForPut_ || attachedForGet_ ) - LogicError("Must detach before reattaching."); - - GlobalArrayGet_ = &X; - attachedForGet_ = true; - + if (attached_) + LogicError("Must detach before reattaching."); + else + attached_ = true; + + if( !attachedForGet_ ) + { + GlobalArrayGet_ = &X; + attachedForGet_ = true; + } + const Grid& g = X.Grid(); //do rma related checks @@ -134,8 +141,6 @@ namespace El { mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); - // do we need a barrier here? - mpi::Barrier (g.VCComm ()); } template @@ -541,21 +546,21 @@ namespace El { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) if( !attachedForPut_ && !attachedForGet_ ) LogicError("Must attach before detaching."); - // Both attachedForPut_ and attachedForGet_ - // could be true, will it cause issues? + const Grid& g = ( attachedForPut_ ? 
GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - + + mpi::WindowUnlock (window); + mpi::WindowFree (window); + attachedForPut_ = false; attachedForGet_ = false; + attached_ = false; - mpi::WindowUnlock (window); - mpi::WindowFree (window); - putVector_.clear(); getVector_.clear(); - + mpi::Barrier (g.VCComm ()); } From 54eba3eace82e306a1a1456e42511733ae21de5a Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 8 Jul 2014 09:49:40 -0500 Subject: [PATCH 022/110] bugs wrt get-put flags --- include/El/core/RmaInterface.hpp | 2 +- src/core/RmaInterface.cpp | 75 +++++++++++++++++--------------- 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 634790ea5c..238adf100e 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -52,7 +52,7 @@ class RmaInterface std::vector getVector_, putVector_; DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; - bool attachedForPut_, attachedForGet_, attached_; + bool toBeAttachedForPut_, toBeAttachedForGet_, attached_; }; } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index b0dbf36632..337fcf6c2b 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -24,7 +24,8 @@ namespace El { RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), putVector_(0), getVector_(0), window (MPI_WIN_NULL), - attachedForPut_(false), attachedForGet_(false) + toBeAttachedForPut_(false), toBeAttachedForGet_(false), + attached_(false) { } template @@ -32,11 +33,11 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - attachedForGet_ = true; - attachedForPut_ = true; - GlobalArrayPut_ = &Z; - GlobalArrayGet_ = 0; - window = MPI_WIN_NULL; + toBeAttachedForGet_ = true; + toBeAttachedForPut_ = true; + GlobalArrayPut_ = &Z; + GlobalArrayGet_ = &Z; + window = MPI_WIN_NULL; const Int p = Z.Grid ().Size(); putVector_.resize( p ); @@ -48,11 +49,11 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - attachedForGet_ = true; - attachedForPut_ = false; - GlobalArrayGet_ = &X; - GlobalArrayPut_ = 0; - window = MPI_WIN_NULL; + toBeAttachedForGet_ = true; + toBeAttachedForPut_ = false; + GlobalArrayGet_ = &X; + GlobalArrayPut_ = 0; + window = MPI_WIN_NULL; const Int p = X.Grid ().Size (); getVector_.resize( p ); @@ -93,12 +94,12 @@ namespace El { // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - GlobalArrayGet_ = &Z; - attachedForPut_ = true; - attachedForGet_ = true; + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; } const Grid& g = Z.Grid(); // do rma related checks @@ -123,11 +124,15 @@ namespace El { else attached_ = true; - if( !attachedForGet_ ) + if( !toBeAttachedForGet_ && !toBeAttachedForPut_) { - GlobalArrayGet_ = &X; - attachedForGet_ = true; + GlobalArrayGet_ = &X; + toBeAttachedForGet_ = true; + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; } + else + LogicError("Cannot update Global matrix."); const Grid& g = X.Grid(); @@ -150,7 +155,7 @@ namespace El { if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); - if ( !attachedForPut_ ) + if ( !toBeAttachedForPut_ ) LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; @@ -234,9 +239,9 @@ namespace El { 
{ DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) // a call to Attach with a non-const DistMatrix must set - // attachedForGet_ also, if not then it is assumed that + // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached - if ( !attachedForGet_ ) + if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); const DistMatrix &X = *GlobalArrayGet_; @@ -337,15 +342,15 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - if ( !attachedForPut_ ) - LogicError("Global matrix cannot be updated"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated."); if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError("Submatrix offsets must be non-negative."); DistMatrix& Y = *GlobalArrayPut_; if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix."); //do rma related checks @@ -424,7 +429,7 @@ namespace El { void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); DistMatrix& Y = *GlobalArrayPut_; @@ -470,7 +475,7 @@ namespace El { void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); const DistMatrix& Y = *GlobalArrayGet_; @@ -517,7 +522,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); // rma checks, see if Z is not NULL, etc @@ -531,7 +536,7 @@ namespace El { { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); // rma checks, see if Z is not NULL, etc @@ -544,19 +549,19 @@ namespace El { void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) - if( !attachedForPut_ && !attachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must attach before detaching."); - const Grid& g = ( attachedForPut_ ? + const Grid& g = ( toBeAttachedForPut_ ? 
GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); mpi::WindowUnlock (window); mpi::WindowFree (window); - attachedForPut_ = false; - attachedForGet_ = false; - attached_ = false; + toBeAttachedForPut_ = false; + toBeAttachedForGet_ = false; + attached_ = false; putVector_.clear(); getVector_.clear(); From cd00d7df3f73167c7acad2467fac7959fcd00d80 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 8 Jul 2014 11:03:48 -0500 Subject: [PATCH 023/110] fixing detach --- src/core/RmaInterface.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 337fcf6c2b..6cd2a9d2db 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -549,7 +549,7 @@ namespace El { void RmaInterface::Detach() { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !attached_ ) LogicError("Must attach before detaching."); const Grid& g = ( toBeAttachedForPut_ ? @@ -563,10 +563,11 @@ namespace El { toBeAttachedForGet_ = false; attached_ = false; + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + putVector_.clear(); getVector_.clear(); - - mpi::Barrier (g.VCComm ()); } template class RmaInterface; From add6b7daf841d91e58254a427c918d4669cdcd73 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 8 Jul 2014 13:54:35 -0500 Subject: [PATCH 024/110] added another flag to track detach, as dtor will call detach again, plus some comments --- include/El/core/RmaInterface.hpp | 3 +- src/core/RmaInterface.cpp | 1137 +++++++++++++++--------------- 2 files changed, 582 insertions(+), 558 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 238adf100e..5c576219b2 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -52,7 +52,8 @@ class RmaInterface std::vector getVector_, putVector_; DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; - bool toBeAttachedForPut_, toBeAttachedForGet_, attached_; + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_; }; } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 6cd2a9d2db..085727a3f3 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -14,567 +14,590 @@ which can be found in the LICENSE file in the root directory, or at */ #include "El-lite.hpp" #include - + // TODO complete the const interfaces... // TODO RMA related checks pending (e.g bounds checking)... 
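
// (On the bounds-checking TODO above: the checks intended here presumably
// mirror what Put and Acc already perform further down in this same file,
// e.g.
//   if( i < 0 || j < 0 )
//       LogicError("Submatrix offsets must be non-negative");
//   if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() )
//       LogicError("Submatrix out of bounds of global matrix");
// only hoisted somewhere common so that Get is covered as well.)
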
#if MPI_VERSION>=3 -namespace El { - - template - RmaInterface::RmaInterface() - : GlobalArrayPut_(0), GlobalArrayGet_(0), - putVector_(0), getVector_(0), window (MPI_WIN_NULL), - toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false) - { } - - template - RmaInterface::RmaInterface( DistMatrix& Z ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - - toBeAttachedForGet_ = true; - toBeAttachedForPut_ = true; - GlobalArrayPut_ = &Z; - GlobalArrayGet_ = &Z; - window = MPI_WIN_NULL; - - const Int p = Z.Grid ().Size(); - putVector_.resize( p ); - getVector_.resize( p ); - } - - template - RmaInterface::RmaInterface( const DistMatrix& X ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - - toBeAttachedForGet_ = true; - toBeAttachedForPut_ = false; - GlobalArrayGet_ = &X; - GlobalArrayPut_ = 0; - window = MPI_WIN_NULL; - - const Int p = X.Grid ().Size (); - getVector_.resize( p ); - putVector_.resize( p ); - } - - template - RmaInterface::~RmaInterface() - { - { - if( std::uncaught_exception() ) - { - std::ostringstream os; - os << "Uncaught exception detected during RmaInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) - } - else - { - Detach(); - } - } - } - - template - void RmaInterface::Attach( DistMatrix& Z ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if (attached_) - LogicError("Must detach before reattaching."); - else - attached_ = true; - - // if DistMatrix is non-const, all one-sided - // transfers -- put, get and acc are possible - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - } - const Grid& g = Z.Grid(); - // do rma related checks - // extra for headers - const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); - // TODO C++ way of type casting? 
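
// (One answer to the cast TODO above, and the one a later patch in this
// series adopts:
//   void* baseptr = reinterpret_cast<void*>(Z.Buffer());
// Unlike the C-style (void *) cast, reinterpret_cast refuses to compile if
// it would silently strip const from a LockedBuffer() pointer.)
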
- void* baseptr = (void *)Z.Buffer (); - // TODO Use DEBUG_ONLY or something that EL provides - assert(baseptr != NULL); - - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); - } - - template - void RmaInterface::Attach( const DistMatrix& X ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if (attached_) - LogicError("Must detach before reattaching."); - else - attached_ = true; - - if( !toBeAttachedForGet_ && !toBeAttachedForPut_) - { - GlobalArrayGet_ = &X; - toBeAttachedForGet_ = true; - GlobalArrayPut_ = 0; - toBeAttachedForPut_ = false; - } - else - LogicError("Cannot update Global matrix."); - - const Grid& g = X.Grid(); - - //do rma related checks - // extra for headers - const Int numEntries = X.LocalHeight () * X.LocalWidth (); - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); - - void* baseptr = (void *)X.LockedBuffer (); - assert (baseptr != NULL); - - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); - } - - template - void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - - DistMatrix& Y = *GlobalArrayPut_; - //do rma related checks - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // put local matrix cells in - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - std::cout << "Before packing payload\n"; - // Pack the payload - // consider ddt here - T* sendData = reinterpret_cast(head); - const T* XBuffer = Y.Buffer(); - const Int XLDim = Y.LDim(); - for( Int t=0; t - void RmaInterface::Put( T alpha, const Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - } - - template - void RmaInterface::Get( Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - - const DistMatrix &X = *GlobalArrayGet_; - - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = 
(X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - // get into local matrix cells from - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; - } - } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } - // Free the memory for the get buffer - getVector_.clear(); - } - - // TODO will deal with const interfaces later - template - void RmaInterface::Get( const Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) - } - - // scaled accumulate - template - void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); - - DistMatrix& Y = *GlobalArrayPut_; - - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); - - //do rma related checks - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // put local matrix cells in - // correct places in global array - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - // Pack the payload - // consider ddt here - T* sendData = reinterpret_cast(head); - const T* XBuffer = Z.Buffer(); - const Int XLDim = Z.LDim(); - for( Int t=0; t - void RmaInterface::Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - } - - template - void RmaInterface::Flush( Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - DistMatrix& Y = 
*GlobalArrayPut_; - - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step - void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - const DistMatrix& Y = *GlobalArrayGet_; - - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step - void RmaInterface::Flush( Matrix& Z ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - // rma checks, see if Z is not NULL, etc - DistMatrix& Y = *GlobalArrayPut_; - - mpi::Flush (window); - } - - template - void RmaInterface::Flush( const Matrix& Z ) - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - // rma checks, see if Z is not NULL, etc - const DistMatrix& Y = *GlobalArrayGet_; - - mpi::Flush (window); - } - - template - void RmaInterface::Detach() - { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) - if( !attached_ ) - LogicError("Must attach before detaching."); - - const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : +namespace El +{ + +template +RmaInterface::RmaInterface() + : GlobalArrayPut_(0), GlobalArrayGet_(0), + putVector_(0), getVector_(0), window (MPI_WIN_NULL), + toBeAttachedForPut_(false), toBeAttachedForGet_(false), + attached_(false), detached_(false) +{ } + +template +RmaInterface::RmaInterface( DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) + + attached_ = false; + detached_ = false; + toBeAttachedForGet_ = true; + toBeAttachedForPut_ = true; + GlobalArrayPut_ = &Z; + GlobalArrayGet_ = &Z; + window = MPI_WIN_NULL; + + const Int p = Z.Grid ().Size(); + putVector_.resize( p ); + getVector_.resize( p ); +} + +template +RmaInterface::RmaInterface( const DistMatrix& X ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) + + attached_ = false; + detached_ = false; + toBeAttachedForGet_ = true; + toBeAttachedForPut_ = false; + GlobalArrayGet_ = &X; + GlobalArrayPut_ = 0; + window = MPI_WIN_NULL; + + const Int p = X.Grid ().Size (); + getVector_.resize( p ); + putVector_.resize( p ); +} + +template +RmaInterface::~RmaInterface() +{ + { + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during RmaInterface destructor " + "that required a call to Detach. 
Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY(DumpCallStack()) + } + else + { + Detach(); + } + } +} + +template +void RmaInterface::Attach( DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + // attached_ will be only set in Attach + // and only unset in Detach + if (!attached_) + attached_ = true; + else + LogicError("Must detach before reattaching."); + + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + } + const Grid& g = Z.Grid(); + // do rma related checks + // extra for headers + const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + // TODO C++ way of type casting? + void* baseptr = (void *)Z.Buffer (); + // TODO Use DEBUG_ONLY or something that EL provides + assert(baseptr != NULL); + + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); + mpi::WindowLock (window); +} + +template +void RmaInterface::Attach( const DistMatrix& X ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + if (!attached_) + attached_ = true; + else + LogicError("Must detach before reattaching."); + + if( !toBeAttachedForGet_ && !toBeAttachedForPut_) + { + GlobalArrayGet_ = &X; + toBeAttachedForGet_ = true; + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + } + else + LogicError("Cannot update Global matrix."); + + const Grid& g = X.Grid(); + + //do rma related checks + // extra for headers + const Int numEntries = X.LocalHeight () * X.LocalWidth (); + const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + + void* baseptr = (void *)X.LockedBuffer (); + assert (baseptr != NULL); + + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); + mpi::WindowLock (window); +} + +template +void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head) = i; + head += sizeof(Int); + *reinterpret_cast(head) = j; + head += sizeof(Int); + *reinterpret_cast(head) = height; + head += sizeof(Int); + *reinterpret_cast(head) = width; + head += sizeof(Int); + *reinterpret_cast(head) = alpha; + head += sizeof(T); + + std::cout << "Before packing payload\n"; + // Pack the payload + // consider ddt here + T* sendData = 
reinterpret_cast(head); + const T* XBuffer = Y.Buffer(); + const Int XLDim = Y.LDim(); + for( Int t=0; t +void RmaInterface::Put( T alpha, const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) +} + +template +void RmaInterface::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + + const DistMatrix &X = *GlobalArrayGet_; + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + + // get into local matrix cells from + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head); + head += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T alpha = *reinterpret_cast < const T * >(head); + head += sizeof (T); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(head); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += alpha * XCol[s]; + } + } + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } + // Free the memory for the get buffer + getVector_.clear(); +} + +// TODO will deal with const interfaces later +template +void RmaInterface::Get( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) +} + +// scaled accumulate +template +void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated."); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative."); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix."); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + 
i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // put local matrix cells in + // correct places in global array + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step(head) = i; + head += sizeof(Int); + *reinterpret_cast(head) = j; + head += sizeof(Int); + *reinterpret_cast(head) = height; + head += sizeof(Int); + *reinterpret_cast(head) = width; + head += sizeof(Int); + *reinterpret_cast(head) = alpha; + head += sizeof(T); + + // Pack the payload + // consider ddt here + T* sendData = reinterpret_cast(head); + const T* XBuffer = Z.Buffer(); + const Int XLDim = Z.LDim(); + for( Int t=0; t +void RmaInterface::Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) +} + +template +void RmaInterface::Flush( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step +void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + const DistMatrix& Y = *GlobalArrayGet_; + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step +void RmaInterface::Flush( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // rma checks, see if Z is not NULL, etc + DistMatrix& Y = *GlobalArrayPut_; + + mpi::Flush (window); +} + +template +void RmaInterface::Flush( const Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // rma checks, see if Z is not NULL, etc + const DistMatrix& Y = *GlobalArrayGet_; + + mpi::Flush (window); +} + +template +void RmaInterface::Detach() +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) + // destructor will call detach again... + if (detached_) + return; + if( !attached_ ) + LogicError("Must attach before detaching."); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - - mpi::WindowUnlock (window); - mpi::WindowFree (window); - - toBeAttachedForPut_ = false; - toBeAttachedForGet_ = false; - attached_ = false; - - GlobalArrayPut_ = 0; - GlobalArrayGet_ = 0; - - putVector_.clear(); - getVector_.clear(); - } - - template class RmaInterface; - template class RmaInterface; - template class RmaInterface; - template class RmaInterface>; - template class RmaInterface>; + + mpi::Barrier( g.VCComm() ); + + attached_ = false; + detached_ = true; + toBeAttachedForPut_ = false; + toBeAttachedForGet_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + putVector_.clear(); + getVector_.clear(); + + mpi::WindowUnlock (window); + mpi::WindowFree (window); +} + +template class RmaInterface; +template class RmaInterface; +template class RmaInterface; +template class RmaInterface>; +template class RmaInterface>; } // namespace El #endif From d03441fcb0b441ff390111d6ad3877e3833ce974 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 8 Jul 2014 14:37:54 -0500 Subject: [PATCH 025/110] temporarily reverting back to something that works...the idea is, when nbc is turned on, we could bypass everything related to handleoms and save memory --- src/core/AxpyInterface.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index d3652f2d97..1b19f31ebf 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -517,20 +517,20 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) // TODO the size of request object is set in this function // bypassing it means passing same request handle multiple // times, we don't care about it in NbC version though(?) -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) //const Int numCreated = dataVectors_[destination].size(); - const Int index = 0;//numCreated; +// const Int index = 0;//numCreated; //dataVectors_[destination].resize (numCreated + 1); //dataVectors_[numCreated].resize (bufferSize); - dataVectors_[0].resize (bufferSize); +// dataVectors_[0].resize (bufferSize); //dataSendRequests_[destination].push_back (mpi::REQUEST_NULL); //sendingData_[destination].push_back (true); -#else +//#else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], dataSendRequests_[destination], sendingData_[destination]); -#endif +//#endif DEBUG_ONLY (if (Int (dataVectors_[destination][index].size ()) != bufferSize) LogicError ("Error in ReadyForSend");) From 2a914b27651323bf027e47437e81a4d3cf0ce35d Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 9 Jul 2014 17:30:53 -0500 Subject: [PATCH 026/110] added rmainterface test in line with axpyinterface, removed all header packing-unpacking logic from rmainterface --- include/El/core/imports/mpi.hpp | 4 +- src/core/RmaInterface.cpp | 86 ++++++++--------------------- tests/core/RmaInterface.cpp | 97 +++++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 65 deletions(-) create mode 100644 tests/core/RmaInterface.cpp diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 89ac9de942..354ddf636d 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -220,8 +220,8 @@ void Iget( void *source, int source_size, int target_rank, int target_size, Window& window); void Rget( void *source, int source_size, int target_rank, int target_size, Window& window, Request& request); -void 
Iacc( void *source, int source_size, int target_rank, - int target_size, Op &op, Window& window); +void Iacc( void *source, int source_size, int target_rank, + int target_size, Op & op, Window & window); void Racc( void *source, int source_size, int target_rank, int target_size, Op &op, Window& window, Request& request); // Synchronization diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 085727a3f3..077096b2e1 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -112,7 +112,8 @@ void RmaInterface::Attach( DistMatrix& Z ) // do rma related checks // extra for headers const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + // why numEntries+1? + const Int bufferSize = (numEntries+1)*sizeof(T); // TODO C++ way of type casting? void* baseptr = (void *)Z.Buffer (); // TODO Use DEBUG_ONLY or something that EL provides @@ -146,7 +147,7 @@ void RmaInterface::Attach( const DistMatrix& X ) //do rma related checks // extra for headers const Int numEntries = X.LocalHeight () * X.LocalWidth (); - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + const Int bufferSize = (numEntries+1)*sizeof(T); void* baseptr = (void *)X.LockedBuffer (); assert (baseptr != NULL); @@ -198,29 +199,15 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + const Int bufferSize = (numEntries+1)*sizeof(T); // Pack the header // make variable names rma friendly putVector_.resize( bufferSize ); byte* sendBuffer = putVector_.data(); - byte* head = sendBuffer; - std::cout << "After pointing head to sendbuffer\n"; - *reinterpret_cast(head) = i; - head += sizeof(Int); - *reinterpret_cast(head) = j; - head += sizeof(Int); - *reinterpret_cast(head) = height; - head += sizeof(Int); - *reinterpret_cast(head) = width; - head += sizeof(Int); - *reinterpret_cast(head) = alpha; - head += sizeof(T); - - std::cout << "Before packing payload\n"; // Pack the payload // consider ddt here - T* sendData = reinterpret_cast(head); + T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Y.Buffer(); const Int XLDim = Y.LDim(); for( Int t=0; t::Put( T alpha, Matrix& Z, Int i, Int j ) for( Int s=0; s::Get( Matrix& Z, Int i, Int j ) if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); - const DistMatrix &X = *GlobalArrayGet_; + const DistMatrix &X = *GlobalArrayGet_; const Grid & g = X.Grid (); const Int r = g.Height (); @@ -277,10 +265,9 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; - // get into local matrix cells from - // correct places in global array Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; + for( Int step=0; step::Get( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + const Int bufferSize = (numEntries+1)*sizeof(T); getVector_.resize (bufferSize); byte *getBuffer = getVector_.data (); mpi::Iget (getBuffer, bufferSize, destination, bufferSize, window); //do we need flush here? 
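
// (On the flush question above: yes. Under passive-target synchronization
// the bytes moved by Iget are only guaranteed to be in getBuffer once the
// operation has completed locally, so unpacking must wait for something like
//   mpi::Iget( getBuffer, bufferSize, destination, bufferSize, window );
//   mpi::Flush( destination, window ); // per-target flush wrapper
//   // ...only now is getBuffer safe to read...
// The next patch in the series instead moves an MPI_Win_flush_local call
// inside the Iput/Iget/Iacc wrappers themselves.)
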
- - // Extract the header - byte *head = getBuffer; - const Int i = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); + const T *XBuffer = reinterpret_cast(getBuffer); const Int colAlign = (X.ColAlign () + i) % r; const Int rowAlign = (X.RowAlign () + j) % c; const Int colShift = Shift (myRow, colAlign, r); @@ -330,7 +303,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); const T *XCol = &XBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; + YCol[s] = XCol[s]; } } receivingRow = (receivingRow + 1) % r; @@ -348,7 +321,7 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) } -// scaled accumulate +// scaled accumulate - dst = dst + alpha*src template void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) { @@ -391,43 +364,30 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight*localWidth; - if( numEntries != 0 ) + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = 4*sizeof(Int) + (numEntries+1)*sizeof(T); + const Int bufferSize = (numEntries+1)*sizeof(T); // Pack the header // make variable names rma friendly putVector_.resize( bufferSize ); byte* sendBuffer = putVector_.data(); - - byte* head = sendBuffer; - *reinterpret_cast(head) = i; - head += sizeof(Int); - *reinterpret_cast(head) = j; - head += sizeof(Int); - *reinterpret_cast(head) = height; - head += sizeof(Int); - *reinterpret_cast(head) = width; - head += sizeof(Int); - *reinterpret_cast(head) = alpha; - head += sizeof(T); - - // Pack the payload - // consider ddt here - T* sendData = reinterpret_cast(head); - const T* XBuffer = Z.Buffer(); + + // consider ddt here + T* sendData = reinterpret_cast(sendBuffer); + T* XBuffer = Z.Buffer(); const Int XLDim = Z.LDim(); - for( Int t=0; t + int +main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + + assert (AXPY_DIM < DIM); + + try + { + Grid grid (comm); + + // Create an 8 x 8 distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + // If we are process 0, then create a 3 x 3 identity matrix, B, + // and Axpy it into the bottom-right of A (using alpha=2) + // NOTE: The bottom-right 3 x 3 submatrix starts at the (5,5) + // entry of A. + // NOTE: Every process is free to Axpy as many submatrices as they + // desire at this point. 
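
// (Semantics exercised below: Acc is the ARMCI-style scaled accumulate
//   A( i:i+m-1, j:j+n-1 ) += alpha * B,
// so with alpha = 2 and B the AXPY_DIM x AXPY_DIM identity, the bottom-right
// block of the zeroed A should come back with 2 on its diagonal in the Get
// copy that follows.)
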
+ if (grid.VCRank () == 0) + { + mpi::Op op; + op.op = MPI_SUM; + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY is scaled accumulate as in ARMCI + Rmaint.Acc (ALPHA, B, op, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + Rmaint.Flush (B, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + } + if (DIM <= 20) + Print (A, "Updated distributed A"); + // Have process 0 request a copy of the entire distributed matrix + // + // NOTE: Every process is free to Axpy as many submatrices as they + // desire at this point. + Matrix < double >C; + if (grid.VCRank () == 0) + { + Zeros (C, DIM, DIM); + Rmaint.Get (C, 0, 0); + Rmaint.Flush (C); + } + + // Process 0 can now locally print its copy of A + if (grid.VCRank () == 0 && DIM <= 20) + Print (C, "Process 0's local copy of A"); + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); + } + } + catch (std::exception & e) + { + ReportException (e); + } + + Finalize (); + return 0; +} From a759aba878c1d87efbcc06359c7e4a3f3272bdfc Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 11 Jul 2014 15:18:25 -0500 Subject: [PATCH 027/110] called flush local after p/g/a, modified put-getvector types and initialization...intermediate commit, acc still does not work because of incorrect indices and window disp --- include/El/core/RmaInterface.hpp | 7 +- src/core/RmaInterface.cpp | 113 +++++++++++++++++-------------- src/core/imports/mpi.cpp | 6 ++ 3 files changed, 75 insertions(+), 51 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 5c576219b2..3a003ddb40 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -49,7 +49,12 @@ class RmaInterface private: mpi::Window window; - std::vector getVector_, putVector_; + + //std::vector>> + // getVector_, putVector_; + std::vector> + getVector_, putVector_; + DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; bool toBeAttachedForPut_, toBeAttachedForGet_, diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 077096b2e1..6aa71b9e53 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -15,8 +15,12 @@ which can be found in the LICENSE file in the root directory, or at #include "El-lite.hpp" #include -// TODO complete the const interfaces... +// TODO Complete the const interfaces... // TODO RMA related checks pending (e.g bounds checking)... +// TODO Consider DDT +// TODO Add disp as a parameter to MPI one sided functions? +// TODO Use DEBUG_ONLY or something that EL provides instead of assert +// TODO Make variable names rma friendly #if MPI_VERSION>=3 namespace El { @@ -106,17 +110,21 @@ void RmaInterface::Attach( DistMatrix& Z ) GlobalArrayPut_ = &Z; toBeAttachedForPut_ = true; GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; + toBeAttachedForGet_ = true; } const Grid& g = Z.Grid(); + const Int p = g.Size (); + + if (putVector_.size() != p) + { + getVector_.resize( p ); + putVector_.resize( p ); + } + // do rma related checks - // extra for headers const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); - // why numEntries+1? - const Int bufferSize = (numEntries+1)*sizeof(T); - // TODO C++ way of type casting? 
- void* baseptr = (void *)Z.Buffer (); - // TODO Use DEBUG_ONLY or something that EL provides + const Int bufferSize = numEntries * sizeof(T); + void* baseptr = reinterpret_cast(Z.Buffer ()); assert(baseptr != NULL); mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); @@ -143,13 +151,18 @@ void RmaInterface::Attach( const DistMatrix& X ) LogicError("Cannot update Global matrix."); const Grid& g = X.Grid(); + const Int p = g.Size (); + + if (putVector_.size() != p) + { + getVector_.resize( p ); + putVector_.resize( p ); + } //do rma related checks - // extra for headers const Int numEntries = X.LocalHeight () * X.LocalWidth (); - const Int bufferSize = (numEntries+1)*sizeof(T); - - void* baseptr = (void *)X.LockedBuffer (); + const Int bufferSize = numEntries * sizeof(T); + void* baseptr = (void*)X.LockedBuffer (); assert (baseptr != NULL); mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); @@ -194,19 +207,17 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) const Int rowShift = Shift( receivingCol, rowAlign, c ); const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight*localWidth; + const Int numEntries = localHeight * localWidth; if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = (numEntries+1)*sizeof(T); + const Int bufferSize = numEntries*sizeof(T); // Pack the header - // make variable names rma friendly - putVector_.resize( bufferSize ); - byte* sendBuffer = putVector_.data(); + putVector_[destination].resize( bufferSize ); + byte* sendBuffer = putVector_[destination].data(); // Pack the payload - // consider ddt here T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Y.Buffer(); const Int XLDim = Y.LDim(); @@ -217,15 +228,14 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) for( Int s=0; s @@ -274,15 +284,15 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int rowShift = Shift( receivingCol, rowAlign, c ); const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight*localWidth; + const Int numEntries = localHeight * localWidth; if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = (numEntries+1)*sizeof(T); + const Int bufferSize = numEntries * sizeof(T); - getVector_.resize (bufferSize); - byte *getBuffer = getVector_.data (); + getVector_[destination].resize (bufferSize); + byte *getBuffer = getVector_[destination].data (); mpi::Iget (getBuffer, bufferSize, destination, bufferSize, window); //do we need flush here? 
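
[A note on the flush question that keeps recurring: MPI-3 separates local
from remote completion under passive-target synchronization, and the two
calls this patch leans on differ exactly there. Sketch, in plain MPI:]

    // after an MPI_Put / MPI_Get / MPI_Accumulate on 'win' targeting 'rank':
    MPI_Win_flush_local( rank, win ); // origin buffers reusable; a Put or
                                      // Accumulate need not be visible at
                                      // the target yet
    MPI_Win_flush( rank, win );       // completed at origin and target both

[For Get, local completion already makes the fetched data readable at the
origin, which is why appending MPI_Win_flush_local to the wrappers, as this
patch does in mpi.cpp, suffices for Iget; Put/Acc users still need
RmaInterface::Flush for remote completion.]
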
@@ -305,13 +315,13 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) for (Int s = 0; s < localHeight; ++s) YCol[s] = XCol[s]; } + // clear + getVector_[destination].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // Free the memory for the get buffer - getVector_.clear(); } // TODO will deal with const interfaces later @@ -348,52 +358,55 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + // global matrix width and height + const Int height = Y.Height(); + const Int width = Y.Width(); - // put local matrix cells in - // correct places in global array Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; + for( Int step=0; step(sendBuffer); - T* XBuffer = Z.Buffer(); + putVector_[destination].resize( bufferSize ); + byte* sendBuffer = putVector_[destination].data(); + T* sendData = reinterpret_cast(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); const Int XLDim = Z.LDim(); // src*alpha - for( Int t=0; t diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 5307bc7868..b8c9a386a4 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -515,6 +515,7 @@ void Iput (void *source, int source_size, int target_rank, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, window)); #endif + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Rput (void *source, int source_size, int target_rank, @@ -533,6 +534,7 @@ void Rput (void *source, int source_size, int target_rank, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, window, &request)); #endif + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Iget (void *source, int source_size, int target_rank, @@ -551,6 +553,7 @@ void Iget (void *source, int source_size, int target_rank, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, window)); #endif + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Rget (void *source, int source_size, int target_rank, @@ -570,6 +573,7 @@ void Rget (void *source, int source_size, int target_rank, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, window, &request)); #endif + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Iacc (void *source, int source_size, int target_rank, @@ -581,6 +585,7 @@ void Iacc (void *source, int source_size, int target_rank, MPI_BYTE, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, op.op, window)); + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Racc (void *source, int source_size, int target_rank, @@ -593,6 +598,7 @@ void Racc (void *source, int source_size, int target_rank, MPI_BYTE, target_rank, 1, (MPI_Aint) target_size, MPI_BYTE, op.op, window, &request)); + SafeMpi (MPI_Win_flush_local (target_rank, window)); } void Flush (int target_rank, Window & window) From 7c169ed8089794b9a8b38a28ab9455854252e1f4 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 11 Jul 2014 18:33:05 -0500 Subject: [PATCH 028/110] added disp in mpi functions --- include/El/core/imports/mpi.hpp | 21 ++++++++++++--------- src/core/RmaInterface.cpp | 17 +++++++---------- src/core/imports/mpi.cpp | 32 ++++++++++++++++---------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 354ddf636d..def98c2e7a 100644 --- a/include/El/core/imports/mpi.hpp +++ 
b/include/El/core/imports/mpi.hpp @@ -213,17 +213,20 @@ void WindowCreate( void* baseptr, int size, Comm comm, Window& window ); void WindowFree (Window & window); // One-sided operations void Iput( void *source, int source_size, int target_rank, - int target_size, Window& window); -void Rput( void *source, int source_size, int target_rank, int target_size, - Window& window, Request& request); + Aint disp, int target_size, Window& window); +void Rput( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, + Request& request); void Iget( void *source, int source_size, int target_rank, - int target_size, Window& window); -void Rget( void *source, int source_size, int target_rank, int target_size, - Window& window, Request& request); + Aint disp, int target_size, Window& window); +void Rget( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, + Request& request); void Iacc( void *source, int source_size, int target_rank, - int target_size, Op & op, Window & window); -void Racc( void *source, int source_size, int target_rank, int target_size, - Op &op, Window& window, Request& request); + Aint disp, int target_size, Op & op, Window & window); +void Racc( void *source, int source_size, int target_rank, + Aint disp, int target_size, Op &op, Window& window, + Request& request); // Synchronization void Flush( int target_rank, Window& window ); void Flush (Window & window); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 6aa71b9e53..c6ebe241db 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -228,7 +228,8 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) for( Int s=0; s::Get( Matrix& Z, Int i, Int j ) getVector_[destination].resize (bufferSize); byte *getBuffer = getVector_[destination].data (); - mpi::Iget (getBuffer, bufferSize, destination, bufferSize, window); + mpi::Aint disp = X.LDim () * sizeof(T); + mpi::Iget (getBuffer, bufferSize, destination, disp, bufferSize, window); //do we need flush here? 
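[Editor's note: the displacement computed here, X.LDim() * sizeof(T), is a single constant for every target and every column; the next patch's subject line records that the correct displacement was worked out there. Since WindowCreate passes disp_unit = 1 to MPI_Win_create, target displacements are byte offsets, and the offset of local element (iLoc, jLoc) in a column-major buffer follows the usual formula. A worked sketch (hypothetical helper, not Elemental API):

#include <mpi.h>
#include <cstddef>

// Byte offset of local element (iLoc, jLoc) given leading dimension ldim,
// valid because the window was created with disp_unit = 1.
MPI_Aint ByteDisp( int iLoc, int jLoc, int ldim, std::size_t elemSize )
{
    return ( static_cast<MPI_Aint>( jLoc ) * ldim + iLoc ) * elemSize;
}
// e.g. doubles with ldim = 4: element (1,2) sits at (1 + 2*4)*8 = 72 bytes,
// matching patch 029's (iLocalOffset + (jLocalOffset+t)*LDim()) * sizeof(T).

]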
// Update Y const T *XBuffer = reinterpret_cast(getBuffer); @@ -392,14 +394,9 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) for( Int s=0; s Date: Tue, 15 Jul 2014 23:19:20 -0500 Subject: [PATCH 029/110] figured out the displacement, Acc contains correct displacement, modify others following the same --- src/core/RmaInterface.cpp | 86 ++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index c6ebe241db..4443a0ab19 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -129,6 +129,7 @@ void RmaInterface::Attach( DistMatrix& Z ) mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); + mpi::Barrier (g.VCComm ()); } template @@ -167,6 +168,7 @@ void RmaInterface::Attach( const DistMatrix& X ) mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); + mpi::Barrier (g.VCComm ()); } template @@ -333,7 +335,8 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) } -// scaled accumulate - dst = dst + alpha*src +// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += alpha X, +// where X is height x width template void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) { @@ -359,50 +362,57 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - - // global matrix width and height - const Int height = Y.Height(); - const Int width = Y.Width(); + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; for( Int step=0; step(sendBuffer); - const T* XBuffer = Z.LockedBuffer(); - const Int XLDim = Z.LDim(); - // src*alpha - for( Int t=0; t(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t(sendBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), + destination, disp, localHeight * sizeof(T), window); + } + // clear + putVector_[destination].resize (0); } receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; } } @@ -432,8 +442,8 @@ void RmaInterface::Flush( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + const Int height = Y.Height(); + const Int width = Y.Width(); // find destination Int receivingRow = myProcessRow; @@ -478,8 +488,8 @@ void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + const Int height = Y.Height(); + const Int width = Y.Width(); // find destination Int receivingRow = myProcessRow; From 194298e014343aa692a9323eaeb7462d3e7a1648 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 09:39:35 -0500 Subject: [PATCH 030/110] addding interface with no local flush, IMO this is temporary, for some testing --- include/El/core/imports/mpi.hpp | 22 +++++- src/core/imports/mpi.cpp | 120 ++++++++++++++++++++++++++++++-- 2 files changed, 135 insertions(+), 7 deletions(-) diff --git a/include/El/core/imports/mpi.hpp 
b/include/El/core/imports/mpi.hpp index def98c2e7a..c769ae38ad 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -212,6 +212,7 @@ void WindowUnlock( Window& window ); void WindowCreate( void* baseptr, int size, Comm comm, Window& window ); void WindowFree (Window & window); // One-sided operations +// local flush present void Iput( void *source, int source_size, int target_rank, Aint disp, int target_size, Window& window); void Rput( void *source, int source_size, int target_rank, @@ -227,9 +228,28 @@ void Iacc( void *source, int source_size, int target_rank, void Racc( void *source, int source_size, int target_rank, Aint disp, int target_size, Op &op, Window& window, Request& request); +// no local flush +void Iput_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window); +void Rput_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, + Request& request); +void Iget_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window); +void Rget_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, + Request& request); +void Iacc_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Op & op, Window & window); +void Racc_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Op &op, Window& window, + Request& request); + // Synchronization void Flush( int target_rank, Window& window ); -void Flush (Window & window); +void Flush(Window & window); +void FlushLocal( int target_rank, Window& window ); +void FlushLocal(Window & window); #endif // Utilities diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 4c0040017a..e4a5143cd7 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -500,6 +500,113 @@ void WindowFree (Window & window) SafeMpi (MPI_Win_free (&window)); } +void Iput_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) +#ifdef EL_ENSURE_PUT_ATOMICITY + SafeMpi (MPI_Accumulate + (source, (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, MPI_REPLACE, window)); +#else + SafeMpi (MPI_Put + (source, (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, window)); +#endif +} + +void Rput_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) +#ifdef EL_ENSURE_PUT_ATOMICITY + SafeMpi (MPI_Raccumulate + (source, (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, MPI_REPLACE, window, &request)); +#else + SafeMpi (MPI_Rput + (source, (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, window, &request)); +#endif +} + +void Iget_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) +#ifdef EL_ENSURE_GET_ATOMICITY + SafeMpi (MPI_Get_accumulate + (NULL, 0, MPI_BYTE, source, + (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, MPI_NO_OP, window)); +#else + SafeMpi (MPI_Get + (source, (MPI_Aint) source_size, MPI_BYTE, + 
target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, window)); +#endif +} + +void Rget_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) +#ifdef EL_ENSURE_GET_ATOMICITY + SafeMpi (MPI_Rget_accumulate + (NULL, 0, MPI_BYTE, source, + (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, MPI_NO_OP, window, &request)); +#else + SafeMpi (MPI_Rget + (source, (MPI_Aint) source_size, MPI_BYTE, + target_rank, disp, (MPI_Aint) target_size, + MPI_BYTE, window, &request)); +#endif +} + +void Iacc_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Op & op, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) + SafeMpi (MPI_Accumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, op.op, + window)); +} + +void Racc_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Op & op, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) + SafeMpi (MPI_Raccumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, op.op, + window, &request)); +} + +void Flush (int target_rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush (target_rank, window)); +} + +void Flush (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush_all (window)); +} + void Iput (void *source, int source_size, int target_rank, Aint disp, int target_size, Window & window) { @@ -601,17 +708,18 @@ void Racc (void *source, int source_size, int target_rank, SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Flush (int target_rank, Window & window) +void FlushLocal (int target_rank, Window & window) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - SafeMpi (MPI_Win_flush (target_rank, window)); + DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) + SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Flush (Window & window) +void FlushLocal (Window & window) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - SafeMpi (MPI_Win_flush_all (window)); + DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) + SafeMpi (MPI_Win_flush_local_all (window)); } + #endif // Various utilities From 8cbc72fad9d0b02843d2ef136145453e501c5903 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 11:36:12 -0500 Subject: [PATCH 031/110] fixed acc and flush...next is get --- src/core/RmaInterface.cpp | 183 +++++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 54 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 4443a0ab19..c7bc88b737 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -129,7 +129,6 @@ void RmaInterface::Attach( DistMatrix& Z ) mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); - mpi::Barrier (g.VCComm ()); } template @@ -168,7 +167,6 @@ void RmaInterface::Attach( const DistMatrix& X ) mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); - mpi::Barrier (g.VCComm ()); } template @@ -176,15 +174,15 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be 
non-negative"); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; //do rma related checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -195,49 +193,55 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - // local width and height + const Int XLDim = Z.LDim(); + // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - // put local matrix cells in - // correct places in global array Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + for( Int step=0; step(sendBuffer); - const T* XBuffer = Y.Buffer(); - const Int XLDim = Y.LDim(); - for( Int t=0; t(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t(sendBuffer + t*localHeight*sizeof(T)), + localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + } + // local flush, okay to clear buffers after this + mpi::FlushLocal (destination, window); + // clear putVector_[destination].resize (0); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; } } @@ -398,15 +402,13 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s(sendBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), - destination, disp, localHeight * sizeof(T), window); + mpi::Iacc_nolocalflush (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), + localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), op, window); } + // local flush, okay to clear buffers after this + mpi::FlushLocal (destination, window); // clear putVector_[destination].resize (0); } @@ -442,8 +444,8 @@ void RmaInterface::Flush( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; // local width and height - const Int height = Y.Height(); - const Int width = Y.Width(); + const Int height = Z.Height(); + const Int width = Z.Width(); // find destination Int receivingRow = myProcessRow; @@ -488,8 +490,8 @@ void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; // local width and height - const Int height = Y.Height(); - const Int width = Y.Width(); + const Int height = Z.Height(); + const Int width = Z.Width(); // find destination Int receivingRow = myProcessRow; @@ -514,18 +516,56 @@ void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) } } +// Are these only useful when the user wants to +// get/put the entire DistMatrix to it's local +// PE/everyone in world ? 
template void RmaInterface::Flush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - // rma checks, see if Z is not NULL, etc DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + // i = j = 0 - leftmost coordinates of DistMatrix + const Int colAlign = Y.ColAlign() % r; + const Int rowAlign = Y.RowAlign() % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - mpi::Flush (window); + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step @@ -539,7 +579,42 @@ void RmaInterface::Flush( const Matrix& Z ) // rma checks, see if Z is not NULL, etc const DistMatrix& Y = *GlobalArrayGet_; - mpi::Flush (window); + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + // i = j = 0 - leftmost coordinates of DistMatrix + const Int colAlign = Y.ColAlign() % r; + const Int rowAlign = Y.RowAlign() % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step From 77918b2c31ff0c1a0ed94ce83012bb500fa24590 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 12:59:44 -0500 Subject: [PATCH 032/110] trying to fix get, tracked some problem in acc, will fix it soon --- src/core/RmaInterface.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index c7bc88b737..359e016bd2 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -300,12 +300,8 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) getVector_[destination].resize (bufferSize); byte *getBuffer = getVector_[destination].data (); - - mpi::Aint disp = X.LDim () * sizeof(T); - mpi::Iget (getBuffer, bufferSize, destination, disp, bufferSize, window); - //do we need flush here? 
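[Editor's note: this hunk deletes the single up-front Iget; by patches 034-035, Get() issues one get per local column, completes them all with a single local flush, and only then unpacks. A sketch of that final shape, names as in the surrounding RmaInterface code:

for( Int t = 0; t < localWidth; ++t )
{
    mpi::Aint disp = ( iLocalOffset + (jLocalOffset + t) * X.LDim() ) * sizeof(T);
    mpi::Iget_nolocalflush( getBuffer + t*localHeight*sizeof(T),
                            localHeight*sizeof(T), destination, disp,
                            localHeight*sizeof(T), window );
}
mpi::FlushLocal( destination, window ); // every column's get is now locally complete
// ...only then copy getBuffer into Z, column by column...

]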
- // Update Y - const T *XBuffer = reinterpret_cast(getBuffer); + + const T *XBuffer = reinterpret_cast(getBuffer); const Int colAlign = (X.ColAlign () + i) % r; const Int rowAlign = (X.RowAlign () + j) % c; const Int colShift = Shift (myRow, colAlign, r); @@ -316,13 +312,21 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int iLocalOffset = Length (i, X.ColShift (), r); const Int jLocalOffset = Length (j, X.RowShift (), c); - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); + for( Int t=0; t(getBuffer + t*localHeight*sizeof(T)), + localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + // update local matrix + T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); const T *XCol = &XBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[s] = XCol[s]; - } + } + // no difference between localflush + // and flush for Get + mpi::FlushLocal (destination, window); // clear getVector_[destination].resize (0); } @@ -332,7 +336,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) } } -// TODO will deal with const interfaces later template void RmaInterface::Get( const Matrix& Z, Int i, Int j ) { From 51efcaeba477511dabcca2bd5750b41238fc2b6a Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 15:04:08 -0500 Subject: [PATCH 033/110] removed op from acc in rmainterface as well as mpi, mpi has overloaded functions for op...btw acc is still not working, trying to fix it now --- include/El/core/RmaInterface.hpp | 4 ++-- include/El/core/imports/mpi.hpp | 21 ++++++++++++++++----- src/core/RmaInterface.cpp | 6 +++--- src/core/imports/mpi.cpp | 31 +++++++++++++++++++++++++++---- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 3a003ddb40..f958e74631 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -37,8 +37,8 @@ class RmaInterface void Get( Matrix& Z, Int i, Int j ); void Get( const Matrix& Z, Int i, Int j ); - void Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ); - void Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ); + void Acc( T alpha, Matrix& Z, Int i, Int j ); + void Acc( T alpha, const Matrix& Z, Int i, Int j ); void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index c769ae38ad..b292fc0eb0 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -224,9 +224,15 @@ void Rget( void *source, int source_size, int target_rank, Aint disp, int target_size, Window& window, Request& request); void Iacc( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op & op, Window & window); + Aint disp, int target_size, Op op, Window & window); void Racc( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op &op, Window& window, + Aint disp, int target_size, Op op, Window& window, + Request& request); +// use mpi::SUM +void Iacc( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window); +void Racc( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, Request& request); // no local flush void Iput_nolocalflush( void *source, int source_size, int target_rank, @@ -240,11 +246,16 @@ void Rget_nolocalflush( void *source, int source_size, int target_rank, Aint disp, int target_size, Window& window, Request& 
request); void Iacc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op & op, Window & window); + Aint disp, int target_size, Op op, Window & window); void Racc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op &op, Window& window, + Aint disp, int target_size, Op op, Window& window, + Request& request); +// use mpi::SUM +void Iacc_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window); +void Racc_nolocalflush( void *source, int source_size, int target_rank, + Aint disp, int target_size, Window& window, Request& request); - // Synchronization void Flush( int target_rank, Window& window ); void Flush(Window & window); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 359e016bd2..3a72641b75 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -345,7 +345,7 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j ) // scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += alpha X, // where X is height x width template -void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) +void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) @@ -408,7 +408,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) // acc mpi::Aint disp = (iLocalOffset + (jLocalOffset+t) * Y.LDim ()) * sizeof(T); mpi::Iacc_nolocalflush (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), - localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), op, window); + localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); @@ -422,7 +422,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, mpi::Op &op, Int i, Int j ) } template -void RmaInterface::Acc( T alpha, const Matrix& Z, mpi::Op &op, Int i, Int j ) +void RmaInterface::Acc( T alpha, const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) } diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index e4a5143cd7..ce51de1115 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -571,9 +571,32 @@ void Rget_nolocalflush (void *source, int source_size, int target_rank, MPI_BYTE, window, &request)); #endif } +// use mpi::SUM by default +void Iacc_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) + SafeMpi (MPI_Accumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, + window)); +} + +void Racc_nolocalflush (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) + SafeMpi (MPI_Raccumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, + window, &request)); +} void Iacc_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op & op, Window & window) + Aint disp, int target_size, Op op, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) SafeMpi (MPI_Accumulate @@ -584,7 +607,7 @@ void Iacc_nolocalflush (void *source, int source_size, int target_rank, } void Racc_nolocalflush (void *source, int source_size, int 
target_rank, - Aint disp, int target_size, Op & op, Window & window, + Aint disp, int target_size, Op op, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) @@ -684,7 +707,7 @@ void Rget (void *source, int source_size, int target_rank, } void Iacc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op & op, Window & window) + Aint disp, int target_size, Op op, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) SafeMpi (MPI_Accumulate @@ -696,7 +719,7 @@ void Iacc (void *source, int source_size, int target_rank, } void Racc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op & op, Window & window, + Aint disp, int target_size, Op op, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) From da57bb119a8467bdd9b4113f8591338213308e9c Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 16:57:18 -0500 Subject: [PATCH 034/110] intermediate commit to fixing get --- src/core/RmaInterface.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 3a72641b75..cf36f11a10 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -301,7 +301,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) getVector_[destination].resize (bufferSize); byte *getBuffer = getVector_[destination].data (); - const T *XBuffer = reinterpret_cast(getBuffer); const Int colAlign = (X.ColAlign () + i) % r; const Int rowAlign = (X.RowAlign () + j) % c; const Int colShift = Shift (myRow, colAlign, r); @@ -312,21 +311,26 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int iLocalOffset = Length (i, X.ColShift (), r); const Int jLocalOffset = Length (j, X.RowShift (), c); + // get for( Int t=0; t(getBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); - // update local matrix - T *YCol = Z.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] = XCol[s]; + } - // no difference between localflush + // no difference between localflush // and flush for Get mpi::FlushLocal (destination, window); + T* getData = reinterpret_cast(getBuffer); + // update local matrix + for( Int t=0; t::Acc( T alpha, Matrix& Z, Int i, Int j ) thisSendCol[s] = alpha*thisXCol[colShift+s*r]; // acc mpi::Aint disp = (iLocalOffset + (jLocalOffset+t) * Y.LDim ()) * sizeof(T); - mpi::Iacc_nolocalflush (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), + mpi::Iput_nolocalflush (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); } // local flush, okay to clear buffers after this From be45ddf739f4e70454409aa90387a5bdf31dffee Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 16 Jul 2014 17:34:26 -0500 Subject: [PATCH 035/110] fixed get --- src/core/RmaInterface.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index cf36f11a10..93b366eebb 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -281,6 +281,9 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; + + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, 
X.RowShift (), c); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -292,7 +295,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; @@ -301,16 +304,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) getVector_[destination].resize (bufferSize); byte *getBuffer = getVector_[destination].data (); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - // get for( Int t=0; t Date: Wed, 16 Jul 2014 18:41:02 -0500 Subject: [PATCH 036/110] trying to fix acc... --- include/El/core/imports/mpi.hpp | 5 ----- src/core/RmaInterface.cpp | 22 ++++++++++++++-------- src/core/imports/mpi.cpp | 25 +++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index b292fc0eb0..5c87808b71 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -37,11 +37,6 @@ namespace mpi { #define EL_USE_NONBLOCKING_CONSENSUS #endif -// TODO Give this a better name -#ifndef EL_PREFER_WAIT_OVER_TEST -#define EL_PREFER_WAIT_OVER_TEST -#endif - // Experimental MPI performance enhancers #ifndef EL_MPI_EXPERIMENTAL #define EL_MPI_EXPERIMENTAL diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 93b366eebb..71add9d2ea 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -310,7 +310,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) mpi::Aint disp = (iLocalOffset + (jLocalOffset+t) * X.LDim ()) * sizeof(T); mpi::Iget_nolocalflush (reinterpret_cast(getBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); - } // no difference between localflush // and flush for Get @@ -371,7 +370,10 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -383,9 +385,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - // each PEs offset is different - const Int iLocalOffset = Length( i, Y.ColShift (), r ); - const Int jLocalOffset = Length( j, Y.RowShift (), c ); + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; @@ -396,19 +396,25 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); + //src *= scale for( Int t=0; t(sendBuffer + t*localHeight*sizeof(T)), + //mpi::Iacc_nolocalflush (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), + // localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + mpi::Iacc 
(reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); } // local flush, okay to clear buffers after this - mpi::FlushLocal (destination, window); + //mpi::FlushLocal (destination, window); // clear putVector_[destination].resize (0); } diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index ce51de1115..f47e2538b0 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -731,6 +731,31 @@ void Racc (void *source, int source_size, int target_rank, SafeMpi (MPI_Win_flush_local (target_rank, window)); } +void Iacc (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) + SafeMpi (MPI_Accumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, + window)); + SafeMpi (MPI_Win_flush_local (target_rank, window)); +} + +void Racc (void *source, int source_size, int target_rank, + Aint disp, int target_size, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) + SafeMpi (MPI_Raccumulate + (source, (MPI_Aint) source_size, + MPI_BYTE, target_rank, disp, + (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, + window, &request)); + SafeMpi (MPI_Win_flush_local (target_rank, window)); +} + void FlushLocal (int target_rank, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) From b0411baf2186cf88db29175501b3195b4dffcf88 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 17 Jul 2014 20:05:42 -0500 Subject: [PATCH 037/110] modified one-sided interface significantly, using typemap everywhere instead of mpi_byte...added overloaded/modified interfaces --- include/El/core/imports/mpi.hpp | 123 +++-- src/core/RmaInterface.cpp | 43 +- src/core/imports/mpi.cpp | 800 ++++++++++++++++++++++++-------- 3 files changed, 713 insertions(+), 253 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 5c87808b71..846024af2a 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -192,7 +192,8 @@ void Translate ( Comm origComm, int size, const int* origRanks, Comm newComm, int* newRanks ); -//MPI-3 one-sided +// MPI-3 one-sided +// =============== #if MPI_VERSION>=3 // Utilities void SetWindowProp (Window& window, int prop); @@ -207,55 +208,83 @@ void WindowUnlock( Window& window ); void WindowCreate( void* baseptr, int size, Comm comm, Window& window ); void WindowFree (Window & window); // One-sided operations -// local flush present -void Iput( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window); -void Rput( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window, - Request& request); -void Iget( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window); -void Rget( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window, - Request& request); -void Iacc( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window); -void Racc( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window& window, - Request& request); -// use mpi::SUM -void Iacc( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window); -void Racc( void *source, int source_size, int target_rank, - Aint disp, 
int target_size, Window& window, - Request& request); -// no local flush -void Iput_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window); -void Rput_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window, - Request& request); -void Iget_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window); -void Rget_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window, - Request& request); -void Iacc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window); -void Racc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window& window, - Request& request); -// use mpi::SUM -void Iacc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window); -void Racc_nolocalflush( void *source, int source_size, int target_rank, - Aint disp, int target_size, Window& window, - Request& request); +// -------------------- +// put +// --- +template +void Iput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Rput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Iput( T source, int target_rank, Aint disp, Window& window ); +template +void Rput( T source, int target_rank, Aint disp, + Window& window, Request& request ); +// get +// --- +template +void Iget (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Iget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Rget (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Rget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Iget( T source, int target_rank, Aint disp, Window& window ); +template +void Rget( T source, int target_rank, Aint disp, + Window& window, Request& request ); +// acc +// --- +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template +void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request); +template +void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request); +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Iacc (const T source, int 
target_rank, Aint disp, Window & window); +template +void Racc (const T source, int target_rank, Aint disp, Window & window, + Request & request); // Synchronization +// --------------- void Flush( int target_rank, Window& window ); -void Flush(Window & window); +void Flush( Window & window ); void FlushLocal( int target_rank, Window& window ); -void FlushLocal(Window & window); +void FlushLocal( Window & window ); #endif // Utilities diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 71add9d2ea..881a22382e 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -162,7 +162,7 @@ void RmaInterface::Attach( const DistMatrix& X ) //do rma related checks const Int numEntries = X.LocalHeight () * X.LocalWidth (); const Int bufferSize = numEntries * sizeof(T); - void* baseptr = (void*)X.LockedBuffer (); + void* baseptr = (void*)(X.LockedBuffer ()); assert (baseptr != NULL); mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); @@ -170,7 +170,7 @@ void RmaInterface::Attach( const DistMatrix& X ) } template -void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) +void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) @@ -203,6 +203,8 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + const Int YLDim = Y.LDim (); for( Int step=0; step::Put( T alpha, Matrix& Z, Int i, Int j ) T* thisSendCol = &sendData[t*localHeight]; const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s(sendBuffer + t*localHeight*sizeof(T)), - localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + mpi::Aint disp = iLocalOffset + (jLocalOffset+t) * YLDim; + mpi::Iput ((sendBuffer + t*localHeight), localHeight, + destination, disp, localHeight, window); } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); @@ -246,7 +248,7 @@ void RmaInterface::Put( T alpha, Matrix& Z, Int i, Int j ) } template -void RmaInterface::Put( T alpha, const Matrix& Z, Int i, Int j ) +void RmaInterface::Put( T scale, const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) } @@ -284,6 +286,8 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int iLocalOffset = Length (i, X.ColShift (), r); const Int jLocalOffset = Length (j, X.RowShift (), c); + + const Int XLDim = X.LDim (); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -307,9 +311,9 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // get for( Int t=0; t(getBuffer + t*localHeight*sizeof(T)), - localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + mpi::Aint disp = iLocalOffset + (jLocalOffset+t) * XLDim; + mpi::Iget ((getBuffer + t*localHeight), localHeight, + destination, disp, localHeight, window); } // no difference between localflush // and flush for Get @@ -338,10 +342,10 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) } -// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += alpha X, +// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += scale X, // where X is height x width template -void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) +void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) @@ -367,6 +371,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) 
const Int rowAlign = (Y.RowAlign() + j) % c; const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); @@ -402,19 +407,17 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) T* thisSendCol = &sendData[t*localHeight]; const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s(sendBuffer + t*localHeight*sizeof(T)), - // localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); - mpi::Iacc (reinterpret_cast(sendBuffer + t*localHeight*sizeof(T)), - localHeight * sizeof(T), destination, disp, localHeight * sizeof(T), window); + mpi::Aint disp = iLocalOffset + (jLocalOffset+t) * YLDim; + mpi::Iacc ((sendBuffer + t*localHeight), localHeight, + destination, disp, localHeight, window); } // local flush, okay to clear buffers after this - //mpi::FlushLocal (destination, window); + mpi::FlushLocal (destination, window); // clear putVector_[destination].resize (0); } @@ -425,7 +428,7 @@ void RmaInterface::Acc( T alpha, Matrix& Z, Int i, Int j ) } template -void RmaInterface::Acc( T alpha, const Matrix& Z, Int i, Int j ) +void RmaInterface::Acc( T scale, const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) } diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index f47e2538b0..7e21e5601e 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -459,7 +459,7 @@ void WindowCreate (void *baseptr, int size, Comm comm, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate")) - // use alloc_shm + // TODO use alloc_shm SafeMpi (MPI_Win_create (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, comm.comm, &window)); @@ -500,260 +500,689 @@ void WindowFree (Window & window) SafeMpi (MPI_Win_free (&window)); } -void Iput_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) +// put +template +void Iput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) #ifdef EL_ENSURE_PUT_ATOMICITY SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_REPLACE, window)); + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), MPI_REPLACE, window)); #else SafeMpi (MPI_Put - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window)); + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), window)); #endif } -void Rput_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, - Request & request) +template +void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) #ifdef EL_ENSURE_PUT_ATOMICITY - SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_REPLACE, window, &request)); +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Accumulate + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), MPI_REPLACE, window)); #else - SafeMpi (MPI_Rput - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - 
MPI_BYTE, window, &request)); + SafeMpi (MPI_Accumulate + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), MPI_REPLACE, window)); #endif -} - -void Iget_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) -#ifdef EL_ENSURE_GET_ATOMICITY - SafeMpi (MPI_Get_accumulate - (NULL, 0, MPI_BYTE, source, - (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_NO_OP, window)); #else - SafeMpi (MPI_Get - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window)); -#endif -} - -void Rget_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, - Request & request) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) -#ifdef EL_ENSURE_GET_ATOMICITY - SafeMpi (MPI_Rget_accumulate - (NULL, 0, MPI_BYTE, source, - (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_NO_OP, window, &request)); +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Put + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), window)); #else - SafeMpi (MPI_Rget - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window, &request)); + SafeMpi (MPI_Put + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), window)); +#endif #endif -} -// use mpi::SUM by default -void Iacc_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) - SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, - window)); -} - -void Racc_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, - Request & request) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) - SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, - window, &request)); -} - -void Iacc_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) - SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, op.op, - window)); -} - -void Racc_nolocalflush (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window, - Request & request) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) - SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, op.op, - window, &request)); -} - -void Flush (int target_rank, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - SafeMpi (MPI_Win_flush (target_rank, window)); -} - -void Flush (Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) - SafeMpi (MPI_Win_flush_all (window)); } -void Iput (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) +template +void Rput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & 
window, + Request & request) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) #ifdef EL_ENSURE_PUT_ATOMICITY - SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_REPLACE, window)); + SafeMpi (MPI_Raccumulate + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), MPI_REPLACE, window, &request)); #else - SafeMpi (MPI_Put - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window)); + SafeMpi (MPI_Rput + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), window, &request)); #endif - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Rput (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, - Request & request) +template +void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) #ifdef EL_ENSURE_PUT_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Raccumulate + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), MPI_REPLACE, window, &request)); +#else SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_REPLACE, window, &request)); + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), MPI_REPLACE, window, &request)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rput + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), window, &request)); #else SafeMpi (MPI_Rput - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window, &request)); + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), window, &request)); +#endif #endif - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Iget (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) +template void Iput (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iput (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iput (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & 
window); +template void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Rput (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rput (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Rput (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 +template +void Iput( T source, int target_rank, Aint disp, Window& window ) +{ Iput ( &source, 1, target_rank, disp, 1, window ); } + +template +void Rput( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ Rput ( &source, 1, target_rank, disp, 1, window, request ); } + +template void Rput (const byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rput (const long long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Rput (const float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void 
Iput (const byte source, int target_rank, + Aint disp, Window & window); +template void Iput (const int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned source, int target_rank, + Aint disp, Window & window); +template void Iput (const long int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iput (const long long int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iput (const float source, int target_rank, + Aint disp, Window & window); +template void Iput (const double source, int target_rank, + Aint disp, Window & window); +template void Iput (const Complex source, int target_rank, + Aint disp, Window & window); +template void Iput (const Complex source, int target_rank, + Aint disp, Window & window); +// get +template +void Iget (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) #ifdef EL_ENSURE_GET_ATOMICITY SafeMpi (MPI_Get_accumulate - (NULL, 0, MPI_BYTE, source, - (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_NO_OP, window)); + (NULL, 0, TypeMap(), const_cast(source), + origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), MPI_NO_OP, window)); #else SafeMpi (MPI_Get - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window)); + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), window)); #endif - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Rget (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, +template +void Iget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) +#ifdef EL_ENSURE_GET_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Get_accumulate + (NULL, 0, TypeMap(), const_cast*>(source), + 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), MPI_NO_OP, window)); +#else + SafeMpi (MPI_Get_accumulate + (NULL, 0, TypeMap>(), const_cast*>(source), + origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), MPI_NO_OP, window)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Get + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), window)); +#else + SafeMpi (MPI_Get + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), window)); +#endif +#endif +} + +template +void Rget (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) #ifdef EL_ENSURE_GET_ATOMICITY SafeMpi (MPI_Rget_accumulate - (NULL, 0, MPI_BYTE, source, - (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, MPI_NO_OP, window, &request)); + (NULL, 0, TypeMap(), const_cast(source), + origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), MPI_NO_OP, window, + &request)); #else SafeMpi 
(MPI_Rget - (source, (MPI_Aint) source_size, MPI_BYTE, - target_rank, disp, (MPI_Aint) target_size, - MPI_BYTE, window, &request)); + (const_cast(source), origin_count, TypeMap(), + target_rank, disp*sizeof(R), target_count, + TypeMap(), window, &request)); #endif - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Iacc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window) +template +void Rget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) +#ifdef EL_ENSURE_GET_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rget_accumulate + (NULL, 0, TypeMap(), const_cast*>(source), + 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), MPI_NO_OP, window, &request)); +#else + SafeMpi (MPI_Rget_accumulate + (NULL, 0, TypeMap>(), const_cast*>(source), + origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), MPI_NO_OP, window, &request)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rget + (const_cast*>(source), 2*origin_count, TypeMap(), + target_rank, 2*disp*sizeof(R), 2*target_count, + TypeMap(), window, &request)); +#else + SafeMpi (MPI_Rget + (const_cast*>(source), origin_count, TypeMap>(), + target_rank, disp*sizeof(Complex), target_count, + TypeMap>(), window, &request)); +#endif +#endif +} + +template void Iget (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iget (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iget (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Rget (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget 
(const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rget (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Rget (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 +template +void Iget( T source, int target_rank, Aint disp, Window& window ) +{ Iget ( &source, 1, target_rank, disp, 1, window ); } + +template +void Rget( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ Rget ( &source, 1, target_rank, disp, 1, window, request ); } + +template void Rget (const byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rget (const long long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Rget (const float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iget (const byte source, int target_rank, + Aint disp, Window & window); +template void Iget (const int source, int target_rank, + Aint disp, Window & window); +template void Iget (const unsigned source, int target_rank, + Aint disp, Window & window); +template void Iget (const long int source, int target_rank, + Aint disp, Window & window); +template void Iget (const unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iget (const long long int source, int target_rank, + Aint disp, Window & window); +template void Iget (const unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iget (const float source, int target_rank, + Aint disp, Window & window); +template void Iget (const double source, int target_rank, + Aint disp, Window & window); +template void 
Iget (const Complex source, int target_rank, + Aint disp, Window & window); +template void Iget (const Complex source, int target_rank, + Aint disp, Window & window); + +// acc +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, op.op, + (const_cast(source), origin_count, + TypeMap(), target_rank, disp*sizeof(R), + target_count, TypeMap(), op.op, window)); - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Racc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Op op, Window & window, +template +void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Accumulate + (const_cast*>(source), 2*origin_count, + TypeMap(), target_rank, disp*sizeof(R), + 2*target_count, TypeMap(), op.op, + window)); +#else + SafeMpi (MPI_Accumulate + (const_cast*>(source), origin_count, + TypeMap>(), target_rank, disp*sizeof(Complex), + target_count, TypeMap>(), op.op, + window)); +#endif +} + +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, op.op, + (const_cast(source), origin_count, + TypeMap(), target_rank, disp*sizeof(R), + target_count, TypeMap(), op.op, window, &request)); - SafeMpi (MPI_Win_flush_local (target_rank, window)); } -void Iacc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) - SafeMpi (MPI_Accumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, - window)); - SafeMpi (MPI_Win_flush_local (target_rank, window)); -} - -void Racc (void *source, int source_size, int target_rank, - Aint disp, int target_size, Window & window, +template +void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) +#ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Raccumulate - (source, (MPI_Aint) source_size, - MPI_BYTE, target_rank, disp, - (MPI_Aint) target_size, MPI_BYTE, MPI_SUM, + (const_cast*>(source), 2*origin_count, + TypeMap(), target_rank, disp*sizeof(R), + 2*target_count, TypeMap(), op.op, window, &request)); - SafeMpi (MPI_Win_flush_local (target_rank, window)); +#else + SafeMpi (MPI_Raccumulate + (const_cast*>(source), origin_count, + TypeMap>(), target_rank, disp*sizeof(Complex), + target_count, TypeMap>(), op.op, + window, &request)); +#endif +} + +template void Iacc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const long int* 
source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iacc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +#endif +template void Iacc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); + +template void Racc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +#endif +template void Racc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); + +// op = SUM +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ Iacc ( source, origin_count, target_rank, disp, target_count, SUM, window ); } + +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ Racc ( source, origin_count, target_rank, disp, target_count, SUM, window, request ); } + +template void Iacc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & 
window); +template void Iacc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iacc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iacc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Racc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Racc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 and op = SUM +template +void Iacc (const T source, int target_rank, Aint disp, Window & window) +{ Iacc ( &source, 1, target_rank, disp, 1, SUM, window ); } + +template +void Racc (const T source, int target_rank, Aint disp, Window & window, + Request & request) +{ Racc ( &source, 1, target_rank, disp, 1, SUM, window, request ); } + +template void Racc (const byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned source, int target_rank, + Aint disp, 
Window & window, Request & request); +template void Racc (const long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Racc (const float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iacc (const byte source, int target_rank, + Aint disp, Window & window); +template void Iacc (const int source, int target_rank, + Aint disp, Window & window); +template void Iacc (const unsigned source, int target_rank, + Aint disp, Window & window); +template void Iacc (const long int source, int target_rank, + Aint disp, Window & window); +template void Iacc (const unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iacc (const long long int source, int target_rank, + Aint disp, Window & window); +template void Iacc (const unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iacc (const float source, int target_rank, + Aint disp, Window & window); +template void Iacc (const double source, int target_rank, + Aint disp, Window & window); +template void Iacc (const Complex source, int target_rank, + Aint disp, Window & window); +template void Iacc (const Complex source, int target_rank, + Aint disp, Window & window); + +// Synchronization +// --------------- +void Flush (int target_rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush (target_rank, window)); +} + +void Flush (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush_all (window)); } void FlushLocal (int target_rank, Window & window) @@ -767,7 +1196,6 @@ void FlushLocal (Window & window) DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) SafeMpi (MPI_Win_flush_local_all (window)); } - #endif // Various utilities From c18aab41104aa51c8cd6b5c8f1831db7fd0f3b02 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 21 Jul 2014 11:40:59 -0500 Subject: [PATCH 038/110] fixing rma interface - intmd commit --- src/core/RmaInterface.cpp | 4 +- src/core/imports/mpi.cpp | 82 +++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 39 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 881a22382e..1020640c77 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -412,8 +412,8 @@ void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) // acc for( Int t=0; t(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), MPI_REPLACE, window)); #else SafeMpi (MPI_Put (const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), window)); #endif } @@ -528,24 +528,24 @@ void 
Iput (const Complex* source, int origin_count, int target_rank, #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Accumulate (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), MPI_REPLACE, window)); #else SafeMpi (MPI_Accumulate (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), MPI_REPLACE, window)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Put (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), window)); #else SafeMpi (MPI_Put (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), window)); #endif #endif @@ -560,12 +560,12 @@ void Rput (const R* source, int origin_count, int target_rank, #ifdef EL_ENSURE_PUT_ATOMICITY SafeMpi (MPI_Raccumulate (const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), MPI_REPLACE, window, &request)); #else SafeMpi (MPI_Rput (const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), window, &request)); #endif } @@ -580,29 +580,28 @@ void Rput (const Complex* source, int origin_count, int target_rank, #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Raccumulate (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), MPI_REPLACE, window, &request)); #else SafeMpi (MPI_Raccumulate (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), MPI_REPLACE, window, &request)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Rput (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), window, &request)); #else SafeMpi (MPI_Rput (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), window, &request)); #endif #endif } - template void Iput (const byte* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window); template void Iput (const int* source, int origin_count, int target_rank, @@ -722,12 +721,12 @@ void Iget (const R* source, int origin_count, int target_rank, SafeMpi (MPI_Get_accumulate (NULL, 0, TypeMap(), const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), MPI_NO_OP, window)); #else SafeMpi (MPI_Get (const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), window)); #endif } @@ -742,25 +741,25 @@ void Iget (const Complex* source, int origin_count, int target_rank, SafeMpi (MPI_Get_accumulate (NULL, 0, TypeMap(), const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), MPI_NO_OP, window)); #else SafeMpi (MPI_Get_accumulate (NULL, 0, TypeMap>(), const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), MPI_NO_OP, 
window)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Get (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), window)); #else SafeMpi (MPI_Get (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), window)); #endif #endif @@ -776,13 +775,13 @@ void Rget (const R* source, int origin_count, int target_rank, SafeMpi (MPI_Rget_accumulate (NULL, 0, TypeMap(), const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), MPI_NO_OP, window, &request)); #else SafeMpi (MPI_Rget (const_cast(source), origin_count, TypeMap(), - target_rank, disp*sizeof(R), target_count, + target_rank, disp, target_count, TypeMap(), window, &request)); #endif } @@ -798,30 +797,29 @@ void Rget (const Complex* source, int origin_count, int target_rank, SafeMpi (MPI_Rget_accumulate (NULL, 0, TypeMap(), const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), MPI_NO_OP, window, &request)); #else SafeMpi (MPI_Rget_accumulate (NULL, 0, TypeMap>(), const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), MPI_NO_OP, window, &request)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Rget (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, 2*disp*sizeof(R), 2*target_count, + target_rank, disp, 2*target_count, TypeMap(), window, &request)); #else SafeMpi (MPI_Rget (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp*sizeof(Complex), target_count, + target_rank, disp, target_count, TypeMap>(), window, &request)); #endif #endif } - template void Iget (const byte* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window); template void Iget (const int* source, int origin_count, int target_rank, @@ -938,11 +936,22 @@ void Iacc (const R* source, int origin_count, int target_rank, Aint disp, int target_count, Op op, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) - SafeMpi (MPI_Accumulate - (const_cast(source), origin_count, - TypeMap(), target_rank, disp*sizeof(R), - target_count, TypeMap(), op.op, - window)); + if (TypeMap() == TypeMap()) + { + SafeMpi (MPI_Accumulate + (const_cast(source), origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window)); + } + else + { + SafeMpi (MPI_Accumulate + (const_cast(source), origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window)); + } } template @@ -953,13 +962,13 @@ void Iacc (const Complex* source, int origin_count, int target_rank, #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Accumulate (const_cast*>(source), 2*origin_count, - TypeMap(), target_rank, disp*sizeof(R), + TypeMap(), target_rank, disp, 2*target_count, TypeMap(), op.op, window)); #else SafeMpi (MPI_Accumulate (const_cast*>(source), origin_count, - TypeMap>(), target_rank, disp*sizeof(Complex), + TypeMap>(), target_rank, disp, target_count, TypeMap>(), op.op, window)); #endif @@ -973,7 +982,7 @@ void Racc (const R* source, int origin_count, int target_rank, DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) SafeMpi (MPI_Raccumulate (const_cast(source), origin_count, - TypeMap(), target_rank, disp*sizeof(R), + TypeMap(), target_rank, disp, 
target_count, TypeMap<R>(), op.op,
               window, &request));
 }
@@ -987,18 +996,17 @@ void Racc (const Complex<R>* source, int origin_count, int target_rank,
 #ifdef EL_AVOID_COMPLEX_MPI
     SafeMpi (MPI_Raccumulate
              (const_cast<Complex<R>*>(source), 2*origin_count,
-              TypeMap<R>(), target_rank, disp*sizeof(R),
+              TypeMap<R>(), target_rank, disp,
               2*target_count, TypeMap<R>(), op.op,
               window, &request));
 #else
     SafeMpi (MPI_Raccumulate
              (const_cast<Complex<R>*>(source), origin_count,
-              TypeMap<Complex<R>>(), target_rank, disp*sizeof(Complex<R>),
+              TypeMap<Complex<R>>(), target_rank, disp,
               target_count, TypeMap<Complex<R>>(), op.op,
               window, &request));
 #endif
 }
-
 template void Iacc (const byte* source, int origin_count, int target_rank,
                     Aint disp, int target_count, Op op, Window & window);
 template void Iacc (const int* source, int origin_count, int target_rank,

From 3a4ca3502b1735501633d39149e53ae8ed79c784 Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Mon, 21 Jul 2014 15:43:33 -0500
Subject: [PATCH 039/110] removed const_cast and changed type of put-get vectors from byte to T

---
 include/El/core/RmaInterface.hpp |   5 +-
 include/El/core/imports/mpi.hpp  |   8 +-
 src/core/RmaInterface.cpp        | 257 ++++++-------
 src/core/imports/mpi.cpp         | 633 ++++++++++++++++---------------
 4 files changed, 450 insertions(+), 453 deletions(-)

diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp
index f958e74631..7f43a6bd95 100644
--- a/include/El/core/RmaInterface.hpp
+++ b/include/El/core/RmaInterface.hpp
@@ -50,9 +50,7 @@ class RmaInterface
 private:
     mpi::Window window;

-    //std::vector>>
-    // getVector_, putVector_;
-    std::vector<std::vector<byte>>
+    std::vector<std::vector<T>>
         getVector_, putVector_;

     DistMatrix<T>* GlobalArrayPut_;
@@ -60,7 +58,6 @@ class RmaInterface
     bool toBeAttachedForPut_, toBeAttachedForGet_,
         attached_, detached_;
 };
-
 } // namespace El
 #endif
 #endif // ifndef EL_RMAINTERFACE_HPP

diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp
index 846024af2a..43a56a03bb 100644
--- a/include/El/core/imports/mpi.hpp
+++ b/include/El/core/imports/mpi.hpp
@@ -233,17 +233,17 @@ void Rput( T source, int target_rank, Aint disp,
 // get
 // ---
 template<typename R>
-void Iget (const R* source, int origin_count, int target_rank,
+void Iget (R* source, int origin_count, int target_rank,
            Aint disp, int target_count, Window & window);
 template<typename R>
-void Iget (const Complex<R>* source, int origin_count, int target_rank,
+void Iget (Complex<R>* source, int origin_count, int target_rank,
            Aint disp, int target_count, Window & window);
 template<typename R>
-void Rget (const R* source, int origin_count, int target_rank,
+void Rget (R* source, int origin_count, int target_rank,
            Aint disp, int target_count, Window & window,
            Request & request);
 template<typename R>
-void Rget (const Complex<R>* source, int origin_count, int target_rank,
+void Rget (Complex<R>* source, int origin_count, int target_rank,
            Aint disp, int target_count, Window & window,
            Request & request);
 template<typename T>

diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp
index 1020640c77..803004e923 100644
--- a/src/core/RmaInterface.cpp
+++ b/src/core/RmaInterface.cpp
@@ -18,9 +18,7 @@ which can be found in the LICENSE file in the root directory, or at
 // TODO Complete the const interfaces...
 // TODO RMA related checks pending (e.g bounds checking)...
 // TODO Consider DDT
-// TODO Add disp as a parameter to MPI one sided functions?
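The TODO removed above is settled by patches 038/039: WindowCreate passes a displacement unit of 1 to MPI_Win_create, and the wrappers now forward `disp` unchanged instead of scaling it by the element size, so callers are responsible for supplying a raw byte offset into the target window. A minimal sketch of that call-site arithmetic, assuming a column-major local buffer with leading dimension LDim; the helper name ByteDisp is made up for illustration and is not part of the patch:

    // Sketch only: assumes the window was created with disp_unit == 1
    // (as WindowCreate does above), so MPI interprets the target
    // displacement as a byte offset into the window.
    #include <mpi.h>
    #include <cstddef>

    // Byte offset of element (iLoc, jLoc) in a column-major buffer of T
    // with leading dimension LDim.
    template <typename T>
    MPI_Aint ByteDisp( std::size_t iLoc, std::size_t jLoc, std::size_t LDim )
    {
        return static_cast<MPI_Aint>( (iLoc + jLoc*LDim) * sizeof(T) );
    }

    // e.g. MPI_Put( buf, n, MPI_DOUBLE, rank,
    //               ByteDisp<double>( iLocalOffset, jLocalOffset, YLDim ),
    //               n, MPI_DOUBLE, win );

Passing byte offsets keeps one window usable for any element type at the cost of doing the sizeof scaling at every call site, which is exactly the disp*sizeof(R) -> disp change patch 038 applies across Iput/Rput/Iget/Rget/Iacc/Racc.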
// TODO Use DEBUG_ONLY or something that EL provides instead of assert -// TODO Make variable names rma friendly #if MPI_VERSION>=3 namespace El { @@ -110,15 +108,15 @@ void RmaInterface::Attach( DistMatrix& Z ) GlobalArrayPut_ = &Z; toBeAttachedForPut_ = true; GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; + toBeAttachedForGet_ = true; } const Grid& g = Z.Grid(); const Int p = g.Size (); - + if (putVector_.size() != p) { - getVector_.resize( p ); - putVector_.resize( p ); + getVector_.resize( p ); + putVector_.resize( p ); } // do rma related checks @@ -155,8 +153,8 @@ void RmaInterface::Attach( const DistMatrix& X ) if (putVector_.size() != p) { - getVector_.resize( p ); - putVector_.resize( p ); + getVector_.resize( p ); + putVector_.resize( p ); } //do rma related checks @@ -174,15 +172,15 @@ void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; //do rma related checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -193,7 +191,7 @@ void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - const Int XLDim = Z.LDim(); + const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); @@ -203,47 +201,46 @@ void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - + const Int YLDim = Y.LDim (); for( Int step=0; step(sendBuffer); - const T* XBuffer = Z.LockedBuffer(); - - for( Int t=0; t(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t::Get( Matrix& Z, Int i, Int j ) const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; - + const Int iLocalOffset = Length (i, X.ColShift (), r); const Int jLocalOffset = Length (j, X.RowShift (), c); - + const Int XLDim = X.LDim (); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - + for( Int step=0; step::Get( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int bufferSize = numEntries * sizeof(T); - - getVector_[destination].resize (bufferSize); - byte *getBuffer = getVector_[destination].data (); - - // get - for( Int t=0; t(getBuffer); - // update local matrix - for( Int t=0; t(getBuffer); + // update local matrix + for( Int t=0; t::Get( const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) } -// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += scale X, +// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += scale X, // where X is height x width template void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) @@ -369,61 +365,60 @@ void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) const Int 
myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); + + const Int XLDim = Z.LDim(); const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - + const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; for( Int step=0; step(sendBuffer); - const T* XBuffer = Z.LockedBuffer(); - - //src *= scale - for( Int t=0; t(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + //src *= scale + for( Int t=0; t::Flush( const Matrix& Z, Int i, Int j ) } } -// Are these only useful when the user wants to -// get/put the entire DistMatrix to it's local +// Are these only useful when the user wants to +// get/put the entire DistMatrix to it's local // PE/everyone in world ? template void RmaInterface::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); DistMatrix& Y = *GlobalArrayPut_; - + //do rma related checks const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -632,7 +627,7 @@ void RmaInterface::Detach() DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach")) // destructor will call detach again... if (detached_) - return; + return; if( !attached_ ) LogicError("Must attach before detaching."); @@ -641,7 +636,7 @@ void RmaInterface::Detach() GlobalArrayGet_->Grid() ); mpi::Barrier( g.VCComm() ); - + attached_ = false; detached_ = true; toBeAttachedForPut_ = false; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 20c88bcdc2..48037147f7 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -461,43 +461,43 @@ void WindowCreate (void *baseptr, int size, Comm comm, Window & window) // TODO use alloc_shm SafeMpi (MPI_Win_create - (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, - comm.comm, &window)); + (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, + comm.comm, &window)); #ifdef EL_NO_ACC_ORDERING SetWindowProp (window, NO_ACC_ORDERING); #endif } -void CheckBounds (Window & window, Datatype win_type, Datatype type, -size_t count, ptrdiff_t target_offset) +void CheckBounds (Window & window, Datatype win_type, Datatype type, + size_t count, ptrdiff_t target_offset) { - int flag, type_size, win_type_size; - size_t displ; - void * dest=NULL; + int flag, type_size, win_type_size; + size_t displ; + void * dest=NULL; - SafeMpi (MPI_Type_size (type, &type_size)); - SafeMpi (MPI_Type_size (win_type, &win_type_size)); - Aint lb, extent; + SafeMpi (MPI_Type_size (type, &type_size)); + SafeMpi (MPI_Type_size (win_type, &win_type_size)); + Aint lb, extent; - SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); + SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); - /* Calculate displacement from beginning of the window */ - if (dest == MPI_BOTTOM) - displ = 0; - else - displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); + /* Calculate displacement from beginning of the window */ + if (dest == MPI_BOTTOM) + displ = 0; + else + displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); - SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); + SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); - // invalid remote address - assert (displ >= 0 && displ < 
win_type_size); - // transfer out of range - assert (displ + count*extent <= win_type_size); + // invalid remote address + assert (displ >= 0 && displ < win_type_size); + // transfer out of range + assert (displ + count*extent <= win_type_size); } void WindowFree (Window & window) { - SafeMpi (MPI_Win_free (&window)); + SafeMpi (MPI_Win_free (&window)); } // put @@ -508,14 +508,14 @@ void Iput (const R* source, int origin_count, int target_rank, DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) #ifdef EL_ENSURE_PUT_ATOMICITY SafeMpi (MPI_Accumulate - (const_cast(source), origin_count, TypeMap(), + (source, origin_count, TypeMap(), target_rank, disp, target_count, TypeMap(), MPI_REPLACE, window)); #else SafeMpi (MPI_Put - (const_cast(source), origin_count, TypeMap(), - target_rank, disp, target_count, - TypeMap(), window)); + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window)); #endif } @@ -527,225 +527,229 @@ void Iput (const Complex* source, int origin_count, int target_rank, #ifdef EL_ENSURE_PUT_ATOMICITY #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Accumulate - (const_cast*>(source), 2*origin_count, TypeMap(), + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), MPI_REPLACE, window)); #else SafeMpi (MPI_Accumulate - (const_cast*>(source), origin_count, TypeMap>(), + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), MPI_REPLACE, window)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Put - (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, disp, 2*target_count, - TypeMap(), window)); + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window)); #else SafeMpi (MPI_Put - (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp, target_count, - TypeMap>(), window)); + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window)); #endif #endif } template void Rput (const R* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, - Request & request) + Aint disp, int target_count, Window & window, + Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) #ifdef EL_ENSURE_PUT_ATOMICITY SafeMpi (MPI_Raccumulate - (const_cast(source), origin_count, TypeMap(), + (source, origin_count, TypeMap(), target_rank, disp, target_count, TypeMap(), MPI_REPLACE, window, &request)); #else SafeMpi (MPI_Rput - (const_cast(source), origin_count, TypeMap(), - target_rank, disp, target_count, - TypeMap(), window, &request)); + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window, &request)); #endif } template void Rput (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, - Request & request) + Aint disp, int target_count, Window & window, + Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) #ifdef EL_ENSURE_PUT_ATOMICITY #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Raccumulate - (const_cast*>(source), 2*origin_count, TypeMap(), + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), MPI_REPLACE, window, &request)); #else SafeMpi (MPI_Raccumulate - (const_cast*>(source), origin_count, TypeMap>(), + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), MPI_REPLACE, window, &request)); #endif #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Rput - (const_cast*>(source), 2*origin_count, TypeMap(), - target_rank, disp, 
2*target_count, - TypeMap(), window, &request)); + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window, &request)); #else SafeMpi (MPI_Rput - (const_cast*>(source), origin_count, TypeMap>(), - target_rank, disp, target_count, - TypeMap>(), window, &request)); + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window, &request)); #endif #endif } template void Iput (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG template void Iput (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); #endif template void Iput (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iput (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Rput (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void Rput (const long long int* 
source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); #endif template void Rput (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Rput (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); // when source-target size == 1 template void Iput( T source, int target_rank, Aint disp, Window& window ) -{ Iput ( &source, 1, target_rank, disp, 1, window ); } +{ + Iput ( &source, 1, target_rank, disp, 1, window ); +} template -void Rput( T source, int target_rank, Aint disp, - Window& window, Request& request ) -{ Rput ( &source, 1, target_rank, disp, 1, window, request ); } +void Rput( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ + Rput ( &source, 1, target_rank, disp, 1, window, request ); +} template void Rput (const byte source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const unsigned source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const long int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const unsigned long source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void Rput (const long long int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const unsigned long long source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); #endif template void Rput (const float source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const double source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Rput (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); + 
Aint disp, Window & window, Request & request); template void Iput (const byte source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const unsigned source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const long int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const unsigned long source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG template void Iput (const long long int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const unsigned long long source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); #endif template void Iput (const float source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const double source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const Complex source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iput (const Complex source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); // get template -void Iget (const R* source, int origin_count, int target_rank, +void Iget (R* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) #ifdef EL_ENSURE_GET_ATOMICITY SafeMpi (MPI_Get_accumulate - (NULL, 0, TypeMap(), const_cast(source), + (NULL, 0, TypeMap(), source, origin_count, TypeMap(), target_rank, disp, target_count, TypeMap(), MPI_NO_OP, window)); #else SafeMpi (MPI_Get - (const_cast(source), origin_count, TypeMap(), + (source, origin_count, TypeMap(), target_rank, disp, target_count, TypeMap(), window)); #endif } template -void Iget (const Complex* source, int origin_count, int target_rank, +void Iget (Complex* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) #ifdef EL_ENSURE_GET_ATOMICITY #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Get_accumulate - (NULL, 0, TypeMap(), const_cast*>(source), + (NULL, 0, TypeMap(), source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), MPI_NO_OP, window)); #else SafeMpi (MPI_Get_accumulate - (NULL, 0, TypeMap>(), const_cast*>(source), + (NULL, 0, TypeMap>(), source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), MPI_NO_OP, window)); @@ -753,55 +757,55 @@ void Iget (const Complex* source, int origin_count, int target_rank, #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Get - (const_cast*>(source), 2*origin_count, TypeMap(), + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), window)); #else SafeMpi (MPI_Get - (const_cast*>(source), origin_count, TypeMap>(), + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), window)); #endif #endif } - + template -void Rget (const R* source, int origin_count, int target_rank, +void Rget (R* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) #ifdef EL_ENSURE_GET_ATOMICITY SafeMpi (MPI_Rget_accumulate - (NULL, 0, TypeMap(), 
const_cast(source), + (NULL, 0, TypeMap(), source, origin_count, TypeMap(), target_rank, disp, target_count, - TypeMap(), MPI_NO_OP, window, - &request)); + TypeMap(), MPI_NO_OP, window, + &request)); #else SafeMpi (MPI_Rget - (const_cast(source), origin_count, TypeMap(), + (source, origin_count, TypeMap(), target_rank, disp, target_count, TypeMap(), window, &request)); #endif } template -void Rget (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, - Request & request) +void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) #ifdef EL_ENSURE_GET_ATOMICITY #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Rget_accumulate - (NULL, 0, TypeMap(), const_cast*>(source), + (NULL, 0, TypeMap(), source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), MPI_NO_OP, window, &request)); #else SafeMpi (MPI_Rget_accumulate - (NULL, 0, TypeMap>(), const_cast*>(source), + (NULL, 0, TypeMap>(), source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), MPI_NO_OP, window, &request)); @@ -809,126 +813,130 @@ void Rget (const Complex* source, int origin_count, int target_rank, #else #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Rget - (const_cast*>(source), 2*origin_count, TypeMap(), + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), window, &request)); #else SafeMpi (MPI_Rget - (const_cast*>(source), origin_count, TypeMap>(), + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), window, &request)); #endif #endif } -template void Iget (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); +template void Iget (byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG -template void Iget (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -#endif -template void Iget (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const Complex* source, int 
origin_count, int target_rank, - Aint disp, int target_count, Window & window); -template void Iget (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); - -template void Rget (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); +template void Iget (long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iget (float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Rget (byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void Rget (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -#endif -template void Rget (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); -template void Rget (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); +template void Rget (long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (unsigned long long* source, int origin_count, 
int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Rget (float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); // when source-target size == 1 template void Iget( T source, int target_rank, Aint disp, Window& window ) -{ Iget ( &source, 1, target_rank, disp, 1, window ); } +{ + Iget ( &source, 1, target_rank, disp, 1, window ); +} template -void Rget( T source, int target_rank, Aint disp, - Window& window, Request& request ) -{ Rget ( &source, 1, target_rank, disp, 1, window, request ); } - -template void Rget (const byte source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const int source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const unsigned source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const long int source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const unsigned long source, int target_rank, - Aint disp, Window & window, Request & request); +void Rget( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ + Rget ( &source, 1, target_rank, disp, 1, window, request ); +} + +template void Rget (byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void Rget (const long long int source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const unsigned long long source, int target_rank, - Aint disp, Window & window, Request & request); -#endif -template void Rget (const float source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const double source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); -template void Rget (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); - -template void Iget (const byte source, int target_rank, - Aint disp, Window & window); -template void Iget (const int source, int target_rank, - Aint disp, Window & window); -template void Iget (const unsigned source, int target_rank, - Aint disp, Window & window); -template void Iget (const long int source, int target_rank, - Aint disp, Window & window); -template void Iget (const unsigned long source, int target_rank, - Aint disp, Window & window); +template void Rget (long long int source, int target_rank, + Aint disp, Window & window, Request & request); 
+template void Rget (unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Rget (float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iget (byte source, int target_rank, + Aint disp, Window & window); +template void Iget (int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned source, int target_rank, + Aint disp, Window & window); +template void Iget (long int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned long source, int target_rank, + Aint disp, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG -template void Iget (const long long int source, int target_rank, - Aint disp, Window & window); -template void Iget (const unsigned long long source, int target_rank, - Aint disp, Window & window); -#endif -template void Iget (const float source, int target_rank, - Aint disp, Window & window); -template void Iget (const double source, int target_rank, - Aint disp, Window & window); -template void Iget (const Complex source, int target_rank, - Aint disp, Window & window); -template void Iget (const Complex source, int target_rank, - Aint disp, Window & window); +template void Iget (long long int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iget (float source, int target_rank, + Aint disp, Window & window); +template void Iget (double source, int target_rank, + Aint disp, Window & window); +template void Iget (Complex source, int target_rank, + Aint disp, Window & window); +template void Iget (Complex source, int target_rank, + Aint disp, Window & window); // acc template @@ -936,22 +944,11 @@ void Iacc (const R* source, int origin_count, int target_rank, Aint disp, int target_count, Op op, Window & window) { DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) - if (TypeMap() == TypeMap()) - { - SafeMpi (MPI_Accumulate - (const_cast(source), origin_count, - TypeMap(), target_rank, disp, - target_count, TypeMap(), op.op, - window)); - } - else - { - SafeMpi (MPI_Accumulate - (const_cast(source), origin_count, - TypeMap(), target_rank, disp, - target_count, TypeMap(), op.op, - window)); - } + SafeMpi (MPI_Accumulate + (source, origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window)); } template @@ -961,13 +958,13 @@ void Iacc (const Complex* source, int origin_count, int target_rank, DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Accumulate - (const_cast*>(source), 2*origin_count, + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), op.op, window)); #else SafeMpi (MPI_Accumulate - (const_cast*>(source), origin_count, + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), op.op, window)); @@ -980,11 +977,11 @@ void Racc (const R* source, int origin_count, int target_rank, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) - SafeMpi (MPI_Raccumulate - (const_cast(source), origin_count, - TypeMap(), target_rank, disp, - target_count, 
TypeMap(), op.op, - window, &request)); + SafeMpi (MPI_Raccumulate + (source, origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window, &request)); } template @@ -995,189 +992,197 @@ void Racc (const Complex* source, int origin_count, int target_rank, DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi (MPI_Raccumulate - (const_cast*>(source), 2*origin_count, + (source, 2*origin_count, TypeMap(), target_rank, disp, 2*target_count, TypeMap(), op.op, window, &request)); #else SafeMpi (MPI_Raccumulate - (const_cast*>(source), origin_count, + (source, origin_count, TypeMap>(), target_rank, disp, target_count, TypeMap>(), op.op, window, &request)); #endif } template void Iacc (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG template void Iacc (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); #endif template void Iacc (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Iacc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window); + Aint disp, int target_count, Op op, Window & window); template void Racc (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); 
template void Racc (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void Racc (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); #endif template void Racc (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); template void Racc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Op op, Window & window, Request & request); + Aint disp, int target_count, Op op, Window & window, Request & request); // op = SUM template void Iacc (const R* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window) -{ Iacc ( source, origin_count, target_rank, disp, target_count, SUM, window ); } +{ + Iacc ( source, origin_count, target_rank, disp, target_count, SUM, window ); +} template void Racc (const R* source, int origin_count, int target_rank, Aint disp, int target_count, Window & window, Request & request) -{ Racc ( source, origin_count, target_rank, disp, target_count, SUM, window, request ); } +{ + Racc ( source, origin_count, target_rank, disp, target_count, SUM, window, request ); +} template void Iacc (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG template void Iacc (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint 
disp, int target_count, Window & window); template void Iacc (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); #endif template void Iacc (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Iacc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window); + Aint disp, int target_count, Window & window); template void Racc (const byte* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const unsigned* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const unsigned long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void Racc (const long long int* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const unsigned long long* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); #endif template void Racc (const float* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const double* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); template void Racc (const Complex* source, int origin_count, int target_rank, - Aint disp, int target_count, Window & window, Request & request); + Aint disp, int target_count, Window & window, Request & request); // when source-target size == 1 and op = SUM template void Iacc (const T source, int target_rank, Aint disp, Window & window) -{ Iacc ( &source, 1, target_rank, disp, 1, SUM, window ); } +{ + Iacc ( &source, 1, 
target_rank, disp, 1, SUM, window ); +} template void Racc (const T source, int target_rank, Aint disp, Window & window, Request & request) -{ Racc ( &source, 1, target_rank, disp, 1, SUM, window, request ); } +{ + Racc ( &source, 1, target_rank, disp, 1, SUM, window, request ); +} template void Racc (const byte source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const unsigned source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const long int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const unsigned long source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void Racc (const long long int source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const unsigned long long source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); #endif template void Racc (const float source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const double source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Racc (const Complex source, int target_rank, - Aint disp, Window & window, Request & request); + Aint disp, Window & window, Request & request); template void Iacc (const byte source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const unsigned source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const long int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const unsigned long source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); #ifdef EL_HAVE_MPI_LONG_LONG template void Iacc (const long long int source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const unsigned long long source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); #endif template void Iacc (const float source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const double source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const Complex source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); template void Iacc (const Complex source, int target_rank, - Aint disp, Window & window); + Aint disp, Window & window); // Synchronization // --------------- @@ -1686,7 +1691,7 @@ void TaggedISSend { 
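 // MPI_Issend completes only after the matching receive has started, so
 // completion of every Issend implies that no data message is still
 // unmatched; the nonblocking-consensus Detach() added later in this
 // series relies on exactly that property before starting its IBarrier.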
DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) - SafeMpi + SafeMpi (MPI_Issend (const_cast < R * >(buf), count, TypeMap < R > (), to, tag, comm.comm, From 47862e03e91052cffbf2d8246fe89ae76292c272 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 22 Jul 2014 10:03:04 -0500 Subject: [PATCH 040/110] turning off codepath not required for nb consensus --- src/core/AxpyInterface.cpp | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 1b19f31ebf..d17a73cef2 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -514,28 +514,24 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int destination = receivingRow + r * receivingCol; const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); -// TODO the size of request object is set in this function -// bypassing it means passing same request handle multiple -// times, we don't care about it in NbC version though(?) -//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - //const Int numCreated = dataVectors_[destination].size(); -// const Int index = 0;//numCreated; - //dataVectors_[destination].resize (numCreated + 1); - //dataVectors_[numCreated].resize (bufferSize); -// dataVectors_[0].resize (bufferSize); - //dataSendRequests_[destination].push_back (mpi::REQUEST_NULL); - //sendingData_[destination].push_back (true); -//#else +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + const Int index = dataVectors_[destination].size(); + for (Int i = 0; i < index; ++i) + dataVectors_[destination][i].resize ( bufferSize ); + dataVectors_[destination].resize (index + 1); + dataVectors_[destination][index].resize ( bufferSize ); + mpi::Request dummy_request; +#else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], dataSendRequests_[destination], sendingData_[destination]); -//#endif +#endif DEBUG_ONLY (if (Int (dataVectors_[destination][index].size ()) != bufferSize) LogicError ("Error in ReadyForSend");) - // Pack the header - byte *sendBuffer = dataVectors_[destination][index].data (); + // Pack the header + byte *sendBuffer = dataVectors_[destination][index].data (); byte *head = sendBuffer; *reinterpret_cast < Int * >(head) = i; head += sizeof (Int); @@ -560,13 +556,16 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) thisSendCol[s] = thisXCol[colShift + s * r]; } // Fire off the non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + dummy_request); + mpi::RequestFree (dummy_request); +#else mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); -//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -// we won't use this request, so free it -// mpi::RequestFree (dataSendRequests_[destination][index]); -//#endif +#endif } #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = true; From 27c8da6823c425f716ac1c266f84e381a976b386 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 22 Jul 2014 20:56:10 -0500 Subject: [PATCH 041/110] some modifications for nb consensus --- include/El/core/AxpyInterface.hpp | 2 +- src/core/AxpyInterface.cpp | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 
c3cf52fbd6..7046d246c9 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -49,7 +49,7 @@ class AxpyInterface //request object for polling on Issends #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - bool all_sends_are_finished; + byte all_sends_are_finished; #endif bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index d17a73cef2..a0350c36e1 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -13,6 +13,8 @@ */ #include "El-lite.hpp" +// TODO Fix bug which causes deadlock in NBC version +// when for small AXPY_DIMs namespace El { @@ -112,9 +114,6 @@ namespace El if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) { -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = true; -#endif // Message exists, so recv and pack const Int count = mpi::GetCount < byte > (status); DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) @@ -389,6 +388,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) globalToLocalMat_ = &Z; } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#endif const Int p = Z.Grid ().Size (); sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); @@ -425,9 +427,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = false; -#endif const Int p = X.Grid ().Size (); sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); @@ -484,9 +483,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) LogicError ("Submatrix out of bounds of global matrix"); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = false; -#endif const Grid & g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); @@ -567,9 +563,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) dataSendRequests_[destination][index]); #endif } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = true; -#endif receivingRow = (receivingRow + 1) % r; if (receivingRow == 0) receivingCol = (receivingCol + 1) % c; @@ -742,15 +735,16 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Grid & g = (attachedForLocalToGlobal_ ? 
localToGlobalMat_->Grid () : globalToLocalMat_-> Grid ()); -// TODO Fix bug which causes deadlock in NBC version -// when for small AXPY_DIMs + if (attachedForLocalToGlobal_) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; - + // nonblocking ssends must have been issued + all_sends_are_finished = '1'; + // spin while (!DONE) { HandleLocalToGlobalData (); @@ -761,7 +755,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } else { - if (all_sends_are_finished) + if (all_sends_are_finished == '1') { // all ssends are complete, start nonblocking barrier mpi::IBarrier (g.VCComm (), nb_bar_request); @@ -788,6 +782,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::Barrier (g.VCComm ()); } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#endif attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; recvVector_.clear (); From ec43dd3c53a5971c560268fe51eacc003571d0bb Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 22 Jul 2014 23:47:31 -0500 Subject: [PATCH 042/110] restructure detach for nb consensus --- src/core/AxpyInterface.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index a0350c36e1..adf82e6425 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -389,7 +389,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; + //all_sends_are_finished = '0'; #endif const Int p = Z.Grid ().Size (); sentEomTo_.resize (p, false); @@ -747,7 +747,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) // spin while (!DONE) { - HandleLocalToGlobalData (); if (nb_bar_active) { // test/wait for IBarrier completion @@ -762,6 +761,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) nb_bar_active = true; } } + HandleLocalToGlobalData (); } #else while (!Finished ()) From b70731eaecdd249a99728fb2225c2aa1166c0dcf Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 23 Jul 2014 11:30:39 -0500 Subject: [PATCH 043/110] add a progress routine --- include/El/core/imports/mpi.hpp | 8 ++++---- src/core/RmaInterface.cpp | 10 +++++++++- src/core/imports/mpi.cpp | 7 +++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 43a56a03bb..7bc180c5f6 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -196,10 +196,10 @@ void Translate // =============== #if MPI_VERSION>=3 // Utilities -void SetWindowProp (Window& window, int prop); -void CheckBounds (Window & window, Datatype win_type, Datatype type, -size_t count, ptrdiff_t target_offset); -//NOTE assuming MPI_MODE_NOCHECK +void SetWindowProp ( Window& window, int prop ); +void CheckBounds ( Window & window, Datatype win_type, Datatype type, +size_t count, ptrdiff_t target_offset ); +void RmaProgress ( Comm comm ); // Window creation/update/delete void WindowLock( int rank, Window& window ); void WindowLock( Window& window ); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 803004e923..e3831bcef5 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -242,6 +242,9 @@ void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = 
(receivingCol + 1) % c; } +#ifdef EL_EXPLICIT_PROGRESS + RmaProgress (g.VCComm ()); +#endif } template @@ -420,6 +423,9 @@ void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } +#ifdef EL_EXPLICIT_PROGRESS + RmaProgress (g.VCComm ()); +#endif } template @@ -635,7 +641,9 @@ void RmaInterface::Detach() GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - mpi::Barrier( g.VCComm() ); + // this is causing enormous slowdown + // due to load imbalance + //mpi::Barrier( g.VCComm() ); attached_ = false; detached_ = true; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 48037147f7..6730b94113 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -495,6 +495,13 @@ void CheckBounds (Window & window, Datatype win_type, Datatype type, assert (displ + count*extent <= win_type_size); } +void RmaProgress ( Comm comm ) +{ + int flag; + SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, + comm.comm, &flag, MPI_STATUS_IGNORE)); +} + void WindowFree (Window & window) { SafeMpi (MPI_Win_free (&window)); From 5fc36e94699546d4103904cacbe20d3fd5a5f224 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 23 Jul 2014 18:36:40 -0500 Subject: [PATCH 044/110] added strided/vector api, untested...removed scale from put --- include/El/core/RmaInterface.hpp | 11 ++- include/El/core/imports/mpi.hpp | 42 ++++++++-- src/core/RmaInterface.cpp | 12 ++- src/core/imports/mpi.cpp | 128 +++++++++++++++++++++++++++++++ 4 files changed, 174 insertions(+), 19 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 7f43a6bd95..2d3a3eae73 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -15,9 +15,8 @@ #ifndef EL_RMAINTERFACE_HPP #define EL_RMAINTERFACE_HPP -#if MPI_VERSION>=3 namespace El { - +#if MPI_VERSION>=3 template class RmaInterface { @@ -31,8 +30,8 @@ class RmaInterface void Attach( DistMatrix& Z ); void Attach( const DistMatrix& Z ); - void Put( T alpha, Matrix& Z, Int i, Int j ); - void Put( T alpha, const Matrix& Z, Int i, Int j ); + void Put( Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); void Get( Matrix& Z, Int i, Int j ); void Get( const Matrix& Z, Int i, Int j ); @@ -57,7 +56,7 @@ class RmaInterface const DistMatrix* GlobalArrayGet_; bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; -}; + }; +#endif //MPI-3 } // namespace El -#endif #endif // ifndef EL_RMAINTERFACE_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 7bc180c5f6..896e308869 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -10,7 +10,7 @@ #pragma once #ifndef EL_IMPORTS_MPI_HPP #define EL_IMPORTS_MPI_HPP - +#include namespace El { namespace mpi { @@ -41,7 +41,14 @@ namespace mpi { #ifndef EL_MPI_EXPERIMENTAL #define EL_MPI_EXPERIMENTAL #endif - + +#ifndef EL_INT_SAFE_CAST +#define EL_INT_SAFE_CAST(x) \ + (x < std::numeric_limits::max () && \ + x > std::numeric_limits::min ())? 
\ + static_cast(x): (-99999) +#endif + struct Comm { MPI_Comm comm; @@ -88,10 +95,25 @@ typedef enum PARTIAL_ACC_ORDERING = 2, NO_ACC_ORDERING = 4 } acc_order_t; - -//TODO update these -const int MAX_OUTSTANDING_NB = 100000; -const int FLUSH_FREQUENCY = 10000; +// for ddt +typedef struct El_strided_s +{ + unsigned num; + size_t* sizes; + MPI_Aint* offsets; +} El_strided_t; +typedef struct El_iov_s +{ + unsigned count; + size_t* sizes; + MPI_Aint* offsets; +} El_iov_t; +typedef enum +{ + FIXED_BLOCK_FIXED_STRIDE = 1, + FIXED_BLOCK_VAR_STRIDE = 2, + UNKNOWN_BLOCK_STRIDE = 4 +} vector_pattern_t; #endif typedef MPI_Info Info; // Standard constants @@ -196,11 +218,19 @@ void Translate // =============== #if MPI_VERSION>=3 // Utilities +// --------- void SetWindowProp ( Window& window, int prop ); void CheckBounds ( Window & window, Datatype win_type, Datatype type, size_t count, ptrdiff_t target_offset ); void RmaProgress ( Comm comm ); +void StridedDatatype (El_strided_t* stride_descr, + Datatype old_type, Datatype* new_type, + size_t* source_dims); +void VectorDatatype (El_iov_t * vect_descr, + Datatype old_type, Datatype * new_type, + vector_pattern_t data_pattern); // Window creation/update/delete +// ----------------------------- void WindowLock( int rank, Window& window ); void WindowLock( Window& window ); void WindowUnlock( int rank, Window& window ); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index e3831bcef5..efcc45c21a 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -13,7 +13,7 @@ which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" -#include +#include // TODO Complete the const interfaces... // TODO RMA related checks pending (e.g bounds checking)... 
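// A minimal usage sketch for the El_iov_t/VectorDatatype helper declared
// above in imports/mpi.hpp (the helper is still untested; this assumes
// byte-valued sizes and byte displacements for offsets, as the helper's
// contiguous-transfer branch implies):
//
//   size_t sizes[2]     = { 4 * sizeof (double), 4 * sizeof (double) };
//   MPI_Aint offsets[2] = { 0, 8 * sizeof (double) };
//   mpi::El_iov_t iov   = { 2, sizes, offsets };
//   mpi::Datatype patch;
//   mpi::VectorDatatype (&iov, mpi::TypeMap<double>(), &patch,
//                        mpi::FIXED_BLOCK_FIXED_STRIDE);
//   MPI_Type_commit (&patch); // caller commits, uses, and frees the type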
@@ -168,7 +168,7 @@ void RmaInterface::Attach( const DistMatrix& X ) } template -void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) +void RmaInterface::Put( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) @@ -227,7 +227,7 @@ void RmaInterface::Put( T scale, Matrix& Z, Int i, Int j ) T* thisSendCol = &sendData[t*localHeight]; const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s::Put( T scale, Matrix& Z, Int i, Int j ) } template -void RmaInterface::Put( T scale, const Matrix& Z, Int i, Int j ) +void RmaInterface::Put( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) } @@ -641,9 +641,7 @@ void RmaInterface::Detach() GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - // this is causing enormous slowdown - // due to load imbalance - //mpi::Barrier( g.VCComm() ); + mpi::Barrier( g.VCComm() ); attached_ = false; detached_ = true; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 6730b94113..5a4b98c0bf 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -501,6 +501,134 @@ void RmaProgress ( Comm comm ) SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, comm.comm, &flag, MPI_STATUS_IGNORE)); } +// TODO these functions for DDT creation are +// completely untested +void StridedDatatype (El_strided_t* stride_descr, + Datatype old_type, Datatype* new_type, + size_t* source_dims) +{ + int old_type_size; + SafeMpi (MPI_Type_size (old_type, &old_type_size)); + int *dims = NULL, *sizes = NULL; + + // count of blocks must be non-zero + assert (stride_descr->num > 0); + // size is NULL + assert (stride_descr->sizes != NULL); + // offset is NULL + assert (stride_descr->offsets != NULL); + + // check for contiguous transfers + if ((source_dims == NULL) && (stride_descr->num == 1)) + { + int elem_count = stride_descr->sizes[0] / old_type_size; + // derived datatype is not a multiple of original type + assert ((stride_descr->sizes[0] % old_type_size == 0)); + SafeMpi ( MPI_Type_contiguous (elem_count, old_type, new_type) ); + return; + } + // offsets should be monotonic increasing + for (int i = 1; i < stride_descr->num; i++) + assert (stride_descr->offsets[i] >= stride_descr->offsets[i - 1]); + /* Notes: + * Sayan: This weird hack is because MPI_Type_create_subarray throws an error when + * stride_descr->sizes and source_dims is passed directly (probably type mismatch?) 
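 * (MPI_Type_create_subarray expects plain int arrays for its dimension,
 * size, and start arguments, so the size_t descriptors are narrowed into
 * int scratch arrays below; the reinterpret_cast of the MPI_Aint offsets
 * is only portable where MPI_Aint and int share a representation.)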
*/ + /* heap */ + dims = new int[stride_descr->num]; + sizes = new int[stride_descr->num]; + + for (int i = 0; i < stride_descr->num; i++) + { + dims[i] = EL_INT_SAFE_CAST (source_dims[i]); + sizes[i] = EL_INT_SAFE_CAST (stride_descr->sizes[i]); + } + + SafeMpi ( MPI_Type_create_subarray (stride_descr->num, reinterpret_cast(dims), + reinterpret_cast(sizes), + reinterpret_cast(stride_descr->offsets), MPI_ORDER_C, + old_type, new_type) ); + + delete[] dims; + delete[] sizes; +} + +void VectorDatatype (El_iov_t * vect_descr, + Datatype old_type, Datatype * new_type, + vector_pattern_t data_pattern) +{ + int old_type_size; + int stride; + int fixed_block_fixed_stride = 1, // MPI_Type_vector + fixed_block_var_stride = 1; // MPI_Type_hindexed_block + /* defaults: + * var_block_var_stride=1 - MPI_Type_hindexed + * var_block_fixed_stride=1 - MPI_Type_hindexed + */ + SafeMpi ( MPI_Type_size (old_type, &old_type_size) ); + // count of blocks must be non-zero + assert (vect_descr->count > 0); + // size is NULL + assert (vect_descr->sizes != NULL); + // offset is NULL + assert (vect_descr->offsets != NULL); + // check for contiguous transfers + if (vect_descr->count == 1) + { + int elem_count = vect_descr->sizes[0] / old_type_size; + // derived datatype is not a multiple of original type + assert (vect_descr->sizes[0] % old_type_size == 0); + SafeMpi ( MPI_Type_contiguous (elem_count, old_type, new_type) ); + return; + } + // offsets should be monotonic increasing + for (int i = 1; i < vect_descr->count; i++) + assert (vect_descr->offsets[i] >= vect_descr->offsets[i - 1]); + + // identify the pattern of strides, fixed or varying + if (data_pattern == UNKNOWN_BLOCK_STRIDE) + { + stride = (vect_descr->offsets[1] - vect_descr->offsets[0]); + for (int i = 1; i < vect_descr->count; i++) + { + // check for fixed blocklengths and fixed strides + if ((vect_descr->sizes[i] == vect_descr->sizes[i - 1]) && + (stride == + (vect_descr->offsets[i] - vect_descr->offsets[i - 1]))) + fixed_block_fixed_stride++; + + // check for fixed blocklengths and variable strides + if ((vect_descr->sizes[i] == vect_descr->sizes[i - 1]) && + !(stride == + (vect_descr->offsets[i] - vect_descr->offsets[i - 1]))) + fixed_block_var_stride++; + } + } + + if (data_pattern == FIXED_BLOCK_FIXED_STRIDE) + fixed_block_fixed_stride = vect_descr->count; + + if (data_pattern == FIXED_BLOCK_VAR_STRIDE) + fixed_block_var_stride = vect_descr->count; + + // check if constant strides, if yes + // then create _type_vector, else + // _type_hindexed + if (fixed_block_fixed_stride == vect_descr->count) + { // _vector + int stride = ((vect_descr->offsets[1] - vect_descr->offsets[0]) + / old_type_size); + int blocklength = vect_descr->sizes[0]; + SafeMpi ( MPI_Type_vector (vect_descr->count, blocklength, + stride, old_type, new_type) ); + } + else if (fixed_block_var_stride == vect_descr->count) // _hindexed_block + SafeMpi ( MPI_Type_create_hindexed_block (vect_descr->count, vect_descr->sizes[0], + vect_descr->offsets, old_type, new_type) ); + else // _hindexed + SafeMpi ( MPI_Type_create_hindexed (vect_descr->count, + (const int *) vect_descr->sizes, + vect_descr->offsets, old_type, new_type) ); +} void WindowFree (Window & window) { From f76e5072da43b7ff96982509e53d8d4904acadb9 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 24 Jul 2014 11:09:17 -0500 Subject: [PATCH 045/110] committing test codes, further updates upon testing would ensue --- tests/core/HFsimul.cpp | 164 ++++++++++++++++++++++++++++++++++++ tests/core/RmaInterface.cpp | 35 
+++++--- 2 files changed, 187 insertions(+), 12 deletions(-) create mode 100644 tests/core/HFsimul.cpp diff --git a/tests/core/HFsimul.cpp b/tests/core/HFsimul.cpp new file mode 100644 index 0000000000..8b59b1a8e5 --- /dev/null +++ b/tests/core/HFsimul.cpp @@ -0,0 +1,164 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 +#define ALPHA 2.0 +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Acc (ALPHA, B, i, j); +#if DEBUG > 2 + std::cout << "[" << commRank << "]: AXPY on patch - " << i << " , " << j; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Rmaint.Flush ( B ); + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Get (C, i, j); +#if DEBUG > 2 + std::cout << "[" << commRank << "]: GET from patch - " << i << " , " << j; +#endif + } + 
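+                      // claim the next block-row of work from the shared
+                      // fetch-and-op counter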
counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Get doesn't require flush + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); + + if (DIM <= 20) + Print (A, "Updated distributed A"); + // Process 0 can now locally print its copy of A + if (grid.VCRank () == 0 && DIM <= 20) + Print (C, "Process 0's local copy of A"); + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/core/RmaInterface.cpp b/tests/core/RmaInterface.cpp index 2ad0b796e2..f511f28c62 100644 --- a/tests/core/RmaInterface.cpp +++ b/tests/core/RmaInterface.cpp @@ -13,7 +13,7 @@ using namespace El; #define ITER 10 //#define DIM 1000 -//#define AXPY_DIM 20 +//#define AXPY_DIM 100 #define DIM 20 #define AXPY_DIM 4 #define ALPHA 2.0 @@ -25,6 +25,7 @@ main (int argc, char *argv[]) mpi::Comm comm = mpi::COMM_WORLD; const Int commRank = mpi::Rank (comm); const Int commSize = mpi::Size (comm); + double t1, t2, seconds; assert (AXPY_DIM < DIM); @@ -42,6 +43,7 @@ main (int argc, char *argv[]) if (DIM <= 20) Print (A, "Original distributed A"); + t1 = MPI_Wtime(); for (Int k = 0; k < ITER; ++k) { if (commRank == 0) @@ -58,17 +60,14 @@ main (int argc, char *argv[]) // desire at this point. if (grid.VCRank () == 0) { - mpi::Op op; - op.op = MPI_SUM; Matrix < double >B (AXPY_DIM, AXPY_DIM); Identity (B, AXPY_DIM, AXPY_DIM); - // AXPY is scaled accumulate as in ARMCI - Rmaint.Acc (ALPHA, B, op, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + //Print (B, "Original B"); + // AXPY is scaled accumulate as in ARMCI + Rmaint.Acc (ALPHA, B, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); Rmaint.Flush (B, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + //Print (B, "Updated B"); } - if (DIM <= 20) - Print (A, "Updated distributed A"); - // Have process 0 request a copy of the entire distributed matrix // // NOTE: Every process is free to Axpy as many submatrices as they // desire at this point. 
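// (The timing epilogue added in the hunk below reduces per-rank elapsed
// seconds with MPI_SUM, so rank 0 prints an aggregate across the
// communicator; dividing total_secs by the number of ranks would report
// the mean time per rank instead.)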
@@ -77,15 +76,27 @@ main (int argc, char *argv[]) { Zeros (C, DIM, DIM); Rmaint.Get (C, 0, 0); - Rmaint.Flush (C); + Rmaint.Flush ( C ); } - // Process 0 can now locally print its copy of A - if (grid.VCRank () == 0 && DIM <= 20) - Print (C, "Process 0's local copy of A"); + // Collectively detach in order to finish filling process 0's request Rmaint.Detach (); + + if (DIM <= 20) + Print (A, "Updated distributed A"); + // Process 0 can now locally print its copy of A + if (grid.VCRank () == 0 && DIM <= 20) + Print (C, "Process 0's local copy of A"); } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); } catch (std::exception & e) { From 5de32c42f8c2408c60a431cd4ebbb05c481947c1 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 24 Jul 2014 15:49:34 -0500 Subject: [PATCH 046/110] some more testing, better debug prints --- tests/core/HFsimul.cpp | 44 ++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/core/HFsimul.cpp b/tests/core/HFsimul.cpp index 8b59b1a8e5..1761497023 100644 --- a/tests/core/HFsimul.cpp +++ b/tests/core/HFsimul.cpp @@ -25,11 +25,15 @@ #include using namespace El; -#define ITER 10 +//#define ITER 10 +#define ITER 1 //#define DIM 1000 //#define AXPY_DIM 100 -#define DIM 20 -#define AXPY_DIM 4 +//#define DIM 20 +//#define AXPY_DIM 4 +#define DIM 8 +#define AXPY_DIM 2 + #define ALPHA 2.0 #define FOP_ROOT 0 @@ -103,7 +107,9 @@ int main (int argc, char *argv[]) { Rmaint.Acc (ALPHA, B, i, j); #if DEBUG > 2 - std::cout << "[" << commRank << "]: AXPY on patch - " << i << " , " << j; + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; #endif } counter = ReadInc (win, 0, (long) 1); @@ -124,7 +130,9 @@ int main (int argc, char *argv[]) { Rmaint.Get (C, i, j); #if DEBUG > 2 - std::cout << "[" << commRank << "]: GET from patch - " << i << " , " << j; + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; #endif } counter = ReadInc (win, 0, (long) 1); @@ -135,11 +143,27 @@ int main (int argc, char *argv[]) // Collectively detach in order to finish filling process 0's request Rmaint.Detach (); - if (DIM <= 20) - Print (A, "Updated distributed A"); - // Process 0 can now locally print its copy of A - if (grid.VCRank () == 0 && DIM <= 20) - Print (C, "Process 0's local copy of A"); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); +#endif } t2 = MPI_Wtime(); seconds = (t2 - t1); ///ITER; From 9f29de033698749afaf1d5e817e04a8b2d319146 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 24 Jul 2014 16:57:27 -0500 Subject: [PATCH 047/110] added HF proxy using original AxpyInterface, got to test well --- tests/core/HFsimulAxpyInt.cpp | 191 ++++++++++++++++++++++++++++++++++ tests/core/HFsimulRMAInt.cpp | 188 +++++++++++++++++++++++++++++++++ 2 files changed, 379 insertions(+) create mode 100644 tests/core/HFsimulAxpyInt.cpp create mode 100644 tests/core/HFsimulRMAInt.cpp diff --git 
a/tests/core/HFsimulAxpyInt.cpp b/tests/core/HFsimulAxpyInt.cpp
new file mode 100644
index 0000000000..293637853e
--- /dev/null
+++ b/tests/core/HFsimulAxpyInt.cpp
@@ -0,0 +1,191 @@
+/*
+   Copyright (c) 2009-2014, Jack Poulson
+   Copyright (c) 2011, The University of Texas at Austin
+   Copyright (c) 2014, Sayan Ghosh, University of Houston
+   All rights reserved.
+
+   This file is part of Elemental and is under the BSD 2-Clause License,
+   which can be found in the LICENSE file in the root directory, or at
+http://opensource.org/licenses/BSD-2-Clause
+*/
+/*
+ * This test approximates a Hartree-Fock
+ * application: all the ranks perform Acc
+ * (or Axpy) on different patches of the
+ * matrix during an epoch,
+ * then a Barrier, then another epoch
+ * where all the ranks perform Get (on
+ * their patch).
+ * Some of the MPI functions are not defined
+ * in El, hence this test mixes raw MPI routines
+ * with MPI from El. This is nasty, but will
+ * be cleaned up at some point.
+ * This also requires MPI-3, as we have used
+ * MPI-3 fetch-and-op to simulate a global
+ * counter. We could later use an MPI-2 version
+ * of this function if necessary.
+ */
+#include "El.hpp"
+#include <cassert>
+using namespace El;
+
+#define ITER 1
+//#define DIM 1000
+//#define AXPY_DIM 100
+//#define DIM 20
+//#define AXPY_DIM 4
+#define DIM 8
+#define AXPY_DIM 2
+
+#define ALPHA 2.0
+#define FOP_ROOT 0
+
+#if MPI_VERSION < 3
+# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION >= 3
+#endif
+
+long ReadInc (MPI_Win win, MPI_Aint offset, long inc)
+{
+    long otemp;
+    MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM,
+                      win);
+    MPI_Win_flush (FOP_ROOT, win);
+
+    return otemp;
+}
+
+int main (int argc, char *argv[])
+{
+    Initialize (argc, argv);
+    mpi::Comm comm = mpi::COMM_WORLD;
+    MPI_Win win;
+    const Int commRank = mpi::Rank (comm);
+    const Int commSize = mpi::Size (comm);
+    double t1, t2, seconds;
+    void *win_base;
+    long counter, next = 0;
+
+    assert (DIM % AXPY_DIM == 0);
+
+    try
+    {
+        // Initialization
+        // Allocate memory and create window for ReadInc
+        MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL,
+                          comm.comm, &win_base, &win);
+        memset (win_base, 0, sizeof (long));
+        MPI_Win_lock_all (MPI_MODE_NOCHECK, win);
+
+        // Create window
+        Grid grid (comm);
+
+        // Create a DIM x DIM distributed matrix over the given grid
+        DistMatrix < double, MC, MR > A (DIM, DIM, grid);
+
+        // Set every entry of A to zero
+        Zeros (A, DIM, DIM);
+
+        // Print the original A
+        if (DIM <= 20)
+            Print (A, "Original distributed A");
+
+        t1 = MPI_Wtime();
+        for (Int k = 0; k < ITER; ++k)
+        {
+            if (commRank == 0)
+                std::cout << "Iteration " << k << std::endl;
+
+            // Open up a LOCAL_TO_GLOBAL interface to A
+            AxpyInterface < double >interface;
+            interface.Attach (LOCAL_TO_GLOBAL, A);
+
+            Matrix < double >B (AXPY_DIM, AXPY_DIM);
+            Identity (B, AXPY_DIM, AXPY_DIM);
+            // AXPY into parts of the DistMatrix
+            counter = ReadInc (win, 0, (long) 1);
+            for (int i = 0; i < DIM; i += AXPY_DIM)
+            {
+                if (counter == next)
+                {
+                    for (int j = 0; j < DIM; j += AXPY_DIM)
+                    {
+                        interface.Axpy (ALPHA, B, i, j);
+#if DEBUG > 2
+                        std::cout << std::to_string(commRank) + ": AXPY patch: "
+                            + std::to_string(i) + "," + std::to_string(j)
+                            << std::endl;
+#endif
+                    }
+                    counter = ReadInc (win, 0, (long) 1);
+                }
+                next++;
+            }
+
+            interface.Detach ();
+
+#if DEBUG > 1
+            for (int j = 0; j < commSize; j++)
+            {
+                if (j == commRank)
+                {
+                    if (DIM <= 20)
+                        Print (A, "Updated distributed A");
+                }
+            }
+#endif
+            // Reattach to A, but in the GLOBAL_TO_LOCAL
direction + interface.Attach (GLOBAL_TO_LOCAL, A); + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + // Bring my updated patch to me from DistMatrix + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + interface.Axpy (1.0, C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + interface.Detach (); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp new file mode 100644 index 0000000000..1761497023 --- /dev/null +++ b/tests/core/HFsimulRMAInt.cpp @@ -0,0 +1,188 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. 
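+ * (Editor's note) The patch distribution below uses an NXTVAL-style
+ * shared counter: each rank atomically fetch-and-adds the counter via
+ * ReadInc (defined in this file over MPI_Fetch_and_op) and claims the
+ * row block whose index matches the value it read, roughly:
+ *   counter = ReadInc (win, 0, (long) 1);   // atomic fetch-and-add
+ *   for (int i = 0; i < DIM; i += AXPY_DIM, next++)
+ *     if (counter == next)                  // this rank owns block i
+ *       { Acc/Get on block i; counter = ReadInc (win, 0, (long) 1); }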
+ */
+#include "El.hpp"
+#include <cassert>
+using namespace El;
+
+//#define ITER 10
+#define ITER 1
+//#define DIM 1000
+//#define AXPY_DIM 100
+//#define DIM 20
+//#define AXPY_DIM 4
+#define DIM 8
+#define AXPY_DIM 2
+
+#define ALPHA 2.0
+#define FOP_ROOT 0
+
+#if MPI_VERSION < 3
+# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION >= 3
+#endif
+
+long ReadInc (MPI_Win win, MPI_Aint offset, long inc)
+{
+    long otemp;
+    MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM,
+                      win);
+    MPI_Win_flush (FOP_ROOT, win);
+
+    return otemp;
+}
+
+int main (int argc, char *argv[])
+{
+    Initialize (argc, argv);
+    mpi::Comm comm = mpi::COMM_WORLD;
+    mpi::Window win;
+    const Int commRank = mpi::Rank (comm);
+    const Int commSize = mpi::Size (comm);
+    double t1, t2, seconds;
+    void *win_base;
+    long counter, next = 0;
+
+    assert (DIM % AXPY_DIM == 0);
+
+    try
+    {
+        // Initialization
+        // Allocate memory and create window for ReadInc
+        MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL,
+                          comm.comm, &win_base, &win);
+        memset (win_base, 0, sizeof (long));
+        MPI_Win_lock_all (MPI_MODE_NOCHECK, win);
+
+        // Create window
+        Grid grid (comm);
+
+        // Create a DIM x DIM distributed matrix over the given grid
+        DistMatrix < double, MC, MR > A (DIM, DIM, grid);
+
+        // Set every entry of A to zero
+        Zeros (A, DIM, DIM);
+
+        // Print the original A
+        if (DIM <= 20)
+            Print (A, "Original distributed A");
+
+        t1 = MPI_Wtime();
+        for (Int k = 0; k < ITER; ++k)
+        {
+            if (commRank == 0)
+                std::cout << "Iteration " << k << std::endl;
+
+            RmaInterface < double > Rmaint;
+            Rmaint.Attach (A);
+
+            Matrix < double >B (AXPY_DIM, AXPY_DIM);
+            Identity (B, AXPY_DIM, AXPY_DIM);
+            // AXPY into parts of the DistMatrix
+            counter = ReadInc (win, 0, (long) 1);
+            for (int i = 0; i < DIM; i += AXPY_DIM)
+            {
+                if (counter == next)
+                {
+                    for (int j = 0; j < DIM; j += AXPY_DIM)
+                    {
+                        Rmaint.Acc (ALPHA, B, i, j);
+#if DEBUG > 2
+                        std::cout << std::to_string(commRank) + ": AXPY patch: "
+                            + std::to_string(i) + "," + std::to_string(j)
+                            << std::endl;
+#endif
+                    }
+                    counter = ReadInc (win, 0, (long) 1);
+                }
+                next++;
+            }
+            // Flush all operations from B to DistMatrix
+            Rmaint.Flush ( B );
+            mpi::Barrier ( comm );
+            // Bring my updated patch to me from DistMatrix
+            Matrix < double >C;
+            Zeros (C, AXPY_DIM, AXPY_DIM);
+            for (int i = 0; i < DIM; i += AXPY_DIM)
+            {
+                if (counter == next)
+                {
+                    for (int j = 0; j < DIM; j += AXPY_DIM)
+                    {
+                        Rmaint.Get (C, i, j);
+#if DEBUG > 2
+                        std::cout << std::to_string(commRank) + ": GET patch: "
+                            + std::to_string(i) + "," + std::to_string(j)
+                            << std::endl;
+#endif
+                    }
+                    counter = ReadInc (win, 0, (long) 1);
+                }
+                next++;
+            }
+            // Get doesn't require flush
+            // Collectively detach in order to finish filling process 0's request
+            Rmaint.Detach ();
+
+#if DEBUG > 1
+            for (int j = 0; j < commSize; j++)
+            {
+                if (j == commRank)
+                {
+                    if (DIM <= 20)
+                        Print (A, "Updated distributed A");
+                }
+            }
+            mpi::Barrier ( comm );
+            for (int j = 0; j < commSize; j++)
+            {
+                if (j == commRank)
+                {
+                    // Process 0 can now locally print its copy of A
+                    if (DIM <= 20)
+                        Print (C, "Patch of A");
+                }
+            }
+            mpi::Barrier ( comm );
+#endif
+        }
+        t2 = MPI_Wtime();
+        seconds = (t2 - t1); ///ITER;
+        double total_secs;
+
+        MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+        if (commRank == 0)
+            printf("Time taken for AXPY (secs):%lf \n", total_secs);
+    }
+    catch (std::exception & e)
+    {
+        ReportException (e);
+    }
+
+    // clear window object for FOP
+    MPI_Win_unlock_all (win);
+    MPI_Win_free
(&win); + + mpi::Finalize (); + return 0; +} From c6a067b39eabb9bfc0896e8c70d5a2ad82bd80ed Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 24 Jul 2014 17:39:16 -0500 Subject: [PATCH 048/110] following AXPYinterface syntax --- tests/core/HFsimulRMAInt.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp index 1761497023..e443ec77fc 100644 --- a/tests/core/HFsimulRMAInt.cpp +++ b/tests/core/HFsimulRMAInt.cpp @@ -20,12 +20,12 @@ * in El, hence this test mixes MPI routines * and MPI from El. This is nasty, but at one * point would be made better. + * This is implemented using MPI-3 */ #include "El.hpp" #include using namespace El; -//#define ITER 10 #define ITER 1 //#define DIM 1000 //#define AXPY_DIM 100 @@ -128,7 +128,8 @@ int main (int argc, char *argv[]) { for (int j = 0; j < DIM; j += AXPY_DIM) { - Rmaint.Get (C, i, j); + //Rmaint.Get (C, i, j); + Rmaint.Acc (1.0, C, i, j); #if DEBUG > 2 std::cout << std::to_string(commRank) + ": GET patch: " + std::to_string(i) + "," + std::to_string(j) @@ -140,6 +141,7 @@ int main (int argc, char *argv[]) next++; } // Get doesn't require flush + Rmaint.Flush ( C ); // Collectively detach in order to finish filling process 0's request Rmaint.Detach (); From 79cddb92b1c3aebcd273d393546e941e8d16976f Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 25 Jul 2014 15:05:17 -0500 Subject: [PATCH 049/110] exploring an alternate design of nbc, where notification is sent immediately after sends, as opposed to setting flag in detach. --- include/El/core/AxpyInterface.hpp | 7 ++++++- src/core/AxpyInterface.cpp | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 7046d246c9..6ab117a1fd 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -45,7 +45,12 @@ class AxpyInterface DATA_TAG =1, EOM_TAG =2, DATA_REQUEST_TAG=3, - DATA_REPLY_TAG =4; + DATA_REPLY_TAG =4 +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + ,ALL_ISSENDS_FINISHED =5; +#else + ; +#endif //request object for polling on Issends #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index adf82e6425..03e1f948b0 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -567,6 +567,21 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (receivingRow == 0) receivingCol = (receivingCol + 1) % c; } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + // nonblocking ssends have been issued + // send all PEs + // this block might be invoked by multiple + // processes + for (Int rank = 0; rank < p; rank++) + { + mpi::Request _request; + byte sends_are_finished = '1'; + mpi::TaggedISSend + (&sends_are_finished, sizeof(byte), rank, ALL_ISSENDS_FINISHED, g.VCComm (), + _request); + mpi::RequestFree (_request); + } +#endif } // Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix @@ -739,11 +754,14 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (attachedForLocalToGlobal_) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + // recv messages with ALL_ISSENDS_FINISHED tag + TaggedRecv (&all_sends_are_finished, 1, mpi::ANY_SOURCE, + ALL_ISSENDS_FINISHED, g.VCComm ()); bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; // nonblocking ssends must have been issued - 
all_sends_are_finished = '1'; + //all_sends_are_finished = '1'; // spin while (!DONE) { From cbd7ff5bf8bc2e5e0d3580196de667976819cd96 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 28 Jul 2014 00:14:33 -0500 Subject: [PATCH 050/110] made a modification in nb consensus, unless there is a strong reason to send to all PEs a notification about issends being complete and recv before doing anything in detach, not doing it. Furthermore, I am sure it defeats the purpose of NbC. --- src/core/AxpyInterface.cpp | 69 +++++++++++++++++++----------------- tests/core/HFsimulRMAInt.cpp | 6 ++-- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 03e1f948b0..333dc5e182 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -13,11 +13,8 @@ */ #include "El-lite.hpp" -// TODO Fix bug which causes deadlock in NBC version -// when for small AXPY_DIMs namespace El { - template < typename T > bool AxpyInterface < T >::Finished () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); @@ -387,27 +384,26 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &Z; } - #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - //all_sends_are_finished = '0'; + all_sends_are_finished = '0'; #endif const Int p = Z.Grid ().Size (); + sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); - - sendingData_.resize (p); sendingRequest_.resize (p); - sendingReply_.resize (p); - - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - dataSendRequests_.resize (p); + + eomSendRequests_.resize (p); requestSendRequests_.resize (p); replySendRequests_.resize (p); + requestVectors_.resize (p); - eomSendRequests_.resize (p); + sendingData_.resize (p); + sendingReply_.resize (p); + + dataVectors_.resize (p); + replyVectors_.resize (p); } template < typename T > @@ -427,24 +423,28 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#endif const Int p = X.Grid ().Size (); + sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); - - sendingData_.resize (p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + eomSendRequests_.resize (p); + + requestVectors_.resize (p); sendingRequest_.resize (p); + + sendingData_.resize (p); sendingReply_.resize (p); dataVectors_.resize (p); - requestVectors_.resize (p); replyVectors_.resize (p); - - dataSendRequests_.resize (p); - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - - eomSendRequests_.resize (p); - } + } template < typename T > void AxpyInterface < T >::Axpy (T alpha, Matrix < T > &Z, Int i, Int j) @@ -572,15 +572,16 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) // send all PEs // this block might be invoked by multiple // processes + /* for (Int rank = 0; rank < p; rank++) { - mpi::Request _request; + mpi::Request _request = mpi::REQUEST_NULL; byte sends_are_finished = '1'; mpi::TaggedISSend (&sends_are_finished, sizeof(byte), rank, ALL_ISSENDS_FINISHED, g.VCComm (), _request); - mpi::RequestFree (_request); } + */ #endif } @@ -606,10 +607,10 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) for (Int rank = 0; rank < p; ++rank) { const Int bufferSize = 4 * sizeof (Int); + const Int index 
= ReadyForSend (bufferSize, requestVectors_[rank], requestSendRequests_[rank], sendingRequest_[rank]); - // Copy the request header into the send buffer byte *sendBuffer = requestVectors_[rank][index].data (); byte *head = sendBuffer; @@ -755,16 +756,21 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) // recv messages with ALL_ISSENDS_FINISHED tag - TaggedRecv (&all_sends_are_finished, 1, mpi::ANY_SOURCE, - ALL_ISSENDS_FINISHED, g.VCComm ()); + //TaggedRecv (&all_sends_are_finished, 1, mpi::ANY_SOURCE, + // ALL_ISSENDS_FINISHED, g.VCComm ()); bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; // nonblocking ssends must have been issued - //all_sends_are_finished = '1'; - // spin + all_sends_are_finished = '1'; + // spin till all messages sent have been + // received while (!DONE) { + // probes for incoming message and + // receive + HandleLocalToGlobalData (); + if (nb_bar_active) { // test/wait for IBarrier completion @@ -779,7 +785,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) nb_bar_active = true; } } - HandleLocalToGlobalData (); } #else while (!Finished ()) diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp index e443ec77fc..72c73626e9 100644 --- a/tests/core/HFsimulRMAInt.cpp +++ b/tests/core/HFsimulRMAInt.cpp @@ -128,8 +128,8 @@ int main (int argc, char *argv[]) { for (int j = 0; j < DIM; j += AXPY_DIM) { - //Rmaint.Get (C, i, j); - Rmaint.Acc (1.0, C, i, j); + Rmaint.Get (C, i, j); + //Rmaint.Acc (1.0, C, i, j); #if DEBUG > 2 std::cout << std::to_string(commRank) + ": GET patch: " + std::to_string(i) + "," + std::to_string(j) @@ -141,7 +141,7 @@ int main (int argc, char *argv[]) next++; } // Get doesn't require flush - Rmaint.Flush ( C ); + //Rmaint.Flush ( C ); // Collectively detach in order to finish filling process 0's request Rmaint.Detach (); From e88c84036f886a28aa80a35c7cef8da93da79ea3 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 28 Jul 2014 09:36:41 -0500 Subject: [PATCH 051/110] commented out some of the stuff I had for nbc, it will get completely removed eventually....added a LocalAcc function to simulate AxpyGLOBAL_TO_LOCAL...modified the RMA test so that it uses LocalAcc instead of Get, we will use it interchangeably according to the test needed --- include/El/core/AxpyInterface.hpp | 12 ++--- include/El/core/RmaInterface.hpp | 1 + include/El/core/imports/mpi.hpp | 7 +++ src/core/RmaInterface.cpp | 84 +++++++++++++++++++++++++++++++ tests/core/HFsimulRMAInt.cpp | 6 +-- 5 files changed, 100 insertions(+), 10 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 6ab117a1fd..c30f5cc018 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -45,12 +45,12 @@ class AxpyInterface DATA_TAG =1, EOM_TAG =2, DATA_REQUEST_TAG=3, - DATA_REPLY_TAG =4 -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - ,ALL_ISSENDS_FINISHED =5; -#else - ; -#endif + DATA_REPLY_TAG =4; +//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + //,ALL_ISSENDS_FINISHED =5; +//#else +// ; +//#endif //request object for polling on Issends #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 2d3a3eae73..0fbe8d883e 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -38,6 +38,7 @@ class RmaInterface void Acc( T alpha, 
Matrix& Z, Int i, Int j ); void Acc( T alpha, const Matrix& Z, Int i, Int j ); + void LocalAcc( T alpha, Matrix& Z, Int i, Int j ); void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 896e308869..d06a94c4f7 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -42,6 +42,12 @@ namespace mpi { #define EL_MPI_EXPERIMENTAL #endif +// Use derived datatypes for strided +// vector communication patterns +#ifndef EL_USE_DERIVED_DATATYPE +#define EL_USE_DERIVED_DATATYPE +#endif + #ifndef EL_INT_SAFE_CAST #define EL_INT_SAFE_CAST(x) \ (x < std::numeric_limits::max () && \ @@ -223,6 +229,7 @@ void SetWindowProp ( Window& window, int prop ); void CheckBounds ( Window & window, Datatype win_type, Datatype type, size_t count, ptrdiff_t target_offset ); void RmaProgress ( Comm comm ); +// strided/vector to datatype void StridedDatatype (El_strided_t* stride_descr, Datatype old_type, Datatype* new_type, size_t* source_dims); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index efcc45c21a..adb800e93f 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -434,6 +434,90 @@ void RmaInterface::Acc( T scale, const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) } +// scaled local accumulate, Z += scale * Get Y(i:i+height-1,j:j+width-1), +// where Z is local matrix height x width +template +void RmaInterface::LocalAcc( T scale, Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalAcc")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + + const DistMatrix &X = *GlobalArrayGet_; + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + const Int XLDim = X.LDim (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step(getBuffer); + // update local matrix + for( Int t=0; t void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp index 72c73626e9..a965b16d7f 100644 --- a/tests/core/HFsimulRMAInt.cpp +++ b/tests/core/HFsimulRMAInt.cpp @@ -128,8 +128,8 @@ int main (int argc, char *argv[]) { for (int j = 0; j < DIM; j += AXPY_DIM) { - Rmaint.Get (C, i, j); - //Rmaint.Acc (1.0, C, i, j); + //Rmaint.Get (C, i, j); + Rmaint.LocalAcc (1.0, C, i, j); #if DEBUG > 2 std::cout << std::to_string(commRank) + ": GET patch: " + std::to_string(i) + "," + std::to_string(j) @@ -140,8 +140,6 @@ int main (int argc, char *argv[]) } next++; } - // Get doesn't require flush - //Rmaint.Flush ( C ); // Collectively detach in order to finish filling process 0's request 
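 // [Editor's note] LocalAcc, introduced by this patch, is the inverse-direction
 // accumulate: it fetches Y(i:i+h-1, j:j+w-1) from each owning rank and adds it
 // into the caller's local matrix, so there is no remote write to flush. A
 // per-target sketch of what it does (getBuf/disp are illustrative names, not
 // the interface's actual members):
 //
 //   MPI_Get (getBuf, count, MPI_DOUBLE,
 //            owner, disp, count, MPI_DOUBLE, rmaWindow);
 //   MPI_Win_flush (owner, rmaWindow);   // the Get must complete locally
 //   for (int s = 0; s < count; ++s)     // ...before the local sum
 //       localBuf[s] += getBuf[s];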
Rmaint.Detach (); From c8ac446ee9458d2a11d01433a9bf8a0a9d2b1dbd Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 28 Jul 2014 21:10:55 -0500 Subject: [PATCH 052/110] intermediate commit, removed code paths that required request objects for nbc version (to save memory), updated rma tests --- include/El/core/AxpyInterface.hpp | 32 +++--- src/core/AxpyInterface.cpp | 164 ++++++++++++++++++++---------- src/core/RmaInterface.cpp | 2 +- tests/core/HFsimulAxpyInt.cpp | 25 ++++- tests/core/HFsimulRMAInt.cpp | 33 +++--- 5 files changed, 171 insertions(+), 85 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index c30f5cc018..29473c3e29 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -46,11 +46,6 @@ class AxpyInterface EOM_TAG =2, DATA_REQUEST_TAG=3, DATA_REPLY_TAG =4; -//#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - //,ALL_ISSENDS_FINISHED =5; -//#else -// ; -//#endif //request object for polling on Issends #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) @@ -61,38 +56,45 @@ class AxpyInterface DistMatrix* localToGlobalMat_; const DistMatrix* globalToLocalMat_; +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else std::vector sentEomTo_, haveEomFrom_; - std::vector recvVector_; std::vector eomSendRequests_; - std::vector>> - dataVectors_, requestVectors_, replyVectors_; std::vector> sendingData_, sendingRequest_, sendingReply_; std::vector> dataSendRequests_, requestSendRequests_, replySendRequests_; - +#endif + + std::vector recvVector_; + std::vector>> + dataVectors_, requestVectors_, replyVectors_; + byte sendDummy_, recvDummy_; +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else // Check if we are done with this attachment's work bool Finished(); - // Progress functions void UpdateRequestStatuses(); void HandleEoms(); - void HandleLocalToGlobalData(); - void HandleGlobalToLocalRequest(); void StartSendingEoms(); void FinishSendingEoms(); - void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); - void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); - Int ReadyForSend ( Int sendSize, std::deque>& sendVectors, std::deque& requests, std::deque& requestStatuses ); +#endif + + void HandleLocalToGlobalData(); + void HandleGlobalToLocalRequest(); + + void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); + void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); }; } // namespace El diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 333dc5e182..beec459105 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -15,6 +15,8 @@ namespace El { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else template < typename T > bool AxpyInterface < T >::Finished () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); @@ -97,6 +99,7 @@ namespace El haveEomFrom_[source] = true; } } +#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () { @@ -228,10 +231,18 @@ namespace El const Int numEntries = localHeight * localWidth; const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + const Int index = replyVectors_[source].size(); + for (Int i = 0; i < index; ++i) + replyVectors_[source][i].resize ( bufferSize ); + replyVectors_[source].resize (index + 1); + replyVectors_[source][index].resize ( bufferSize ); + mpi::Request 
dummy_request = mpi::REQUEST_NULL; +#else const Int index = ReadyForSend (bufferSize, replyVectors_[source], replySendRequests_[source], sendingReply_[source]); - +#endif // Pack the reply header byte *sendBuffer = replyVectors_[source][index].data (); byte *sendHead = sendBuffer; @@ -250,9 +261,15 @@ namespace El } // Fire off non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + dummy_request); +#else mpi::TaggedISSend (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), replySendRequests_[source][index]); +#endif } } @@ -284,6 +301,9 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) } const Int p = Z.Grid().Size(); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); @@ -291,15 +311,16 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) sendingRequest_.resize( p ); sendingReply_.resize( p ); - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( p ); eomSendRequests_.resize( p ); +#endif + + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); } template @@ -320,6 +341,8 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } const Int p = X.Grid ().Size (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); @@ -327,16 +350,17 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) sendingRequest_.resize (p); sendingReply_.resize (p); - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - dataSendRequests_.resize (p); requestSendRequests_.resize (p); replySendRequests_.resize (p); eomSendRequests_.resize (p); - } +#endif + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + } template < typename T > AxpyInterface < T >::~AxpyInterface () { @@ -389,20 +413,24 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) #endif const Int p = Z.Grid ().Size (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); + // request objects sendingRequest_.resize (p); dataSendRequests_.resize (p); - eomSendRequests_.resize (p); + // ready-to-send requestSendRequests_.resize (p); replySendRequests_.resize (p); - requestVectors_.resize (p); - sendingData_.resize (p); sendingReply_.resize (p); - - dataVectors_.resize (p); +#endif + + dataVectors_.resize (p); + requestVectors_.resize (p); replyVectors_.resize (p); } @@ -427,23 +455,26 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) all_sends_are_finished = '0'; #endif const Int p = X.Grid ().Size (); - + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); - + // request objects dataSendRequests_.resize (p); requestSendRequests_.resize (p); replySendRequests_.resize (p); eomSendRequests_.resize (p); - - requestVectors_.resize (p); + // ready-to-send sendingRequest_.resize (p); - sendingData_.resize (p); sendingReply_.resize (p); +#endif dataVectors_.resize (p); replyVectors_.resize (p); + requestVectors_.resize (p); } template < typename T > @@ -516,7 +547,7 @@ AxpyInterface::AxpyInterface( AxpyType 
type, const DistMatrix& X ) dataVectors_[destination][i].resize ( bufferSize ); dataVectors_[destination].resize (index + 1); dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request; + mpi::Request dummy_request = mpi::REQUEST_NULL; #else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], @@ -556,7 +587,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dummy_request); - mpi::RequestFree (dummy_request); #else mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), @@ -567,22 +597,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (receivingRow == 0) receivingCol = (receivingCol + 1) % c; } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - // nonblocking ssends have been issued - // send all PEs - // this block might be invoked by multiple - // processes - /* - for (Int rank = 0; rank < p; rank++) - { - mpi::Request _request = mpi::REQUEST_NULL; - byte sends_are_finished = '1'; - mpi::TaggedISSend - (&sends_are_finished, sizeof(byte), rank, ALL_ISSENDS_FINISHED, g.VCComm (), - _request); - } - */ -#endif } // Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix @@ -607,10 +621,18 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) for (Int rank = 0; rank < p; ++rank) { const Int bufferSize = 4 * sizeof (Int); - +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + const Int index = requestVectors_[rank].size(); + for (Int i = 0; i < index; ++i) + requestVectors_[rank][i].resize ( bufferSize ); + requestVectors_[rank].resize (index + 1); + requestVectors_[rank][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; +#else const Int index = ReadyForSend (bufferSize, requestVectors_[rank], requestSendRequests_[rank], sendingRequest_[rank]); +#endif // Copy the request header into the send buffer byte *sendBuffer = requestVectors_[rank][index].data (); byte *head = sendBuffer; @@ -624,11 +646,17 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) head += sizeof (Int); // Begin the non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), + dummy_request); +#else mpi::TaggedISSend (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), requestSendRequests_[rank][index]); +#endif } - + // Receive all of the replies Int numReplies = 0; while (numReplies < p) @@ -680,7 +708,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } } - template < typename T > +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + template < typename T > Int AxpyInterface < T >::ReadyForSend (Int sendSize, std::deque < std::vector < byte >> &sendVectors, @@ -716,7 +746,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) return numCreated; } - + template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) @@ -741,7 +771,8 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); } } - +#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE + template < typename T > void AxpyInterface < T >::Detach () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) @@ -755,9 +786,6 @@ 
AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) if (attachedForLocalToGlobal_) { #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - // recv messages with ALL_ISSENDS_FINISHED tag - //TaggedRecv (&all_sends_are_finished, 1, mpi::ANY_SOURCE, - // ALL_ISSENDS_FINISHED, g.VCComm ()); bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; @@ -797,12 +825,43 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } else { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + // nonblocking ssends must have been issued + all_sends_are_finished = '1'; + // spin till all messages sent have been + // received + while (!DONE) + { + // probes for incoming message requests + // receives, and posts reply + HandleGlobalToLocalRequest (); + + if (nb_bar_active) + { + // test/wait for IBarrier completion + DONE = mpi::Test (nb_bar_request); + } + else + { + if (all_sends_are_finished == '1') + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } + } +#else while (!Finished ()) { HandleGlobalToLocalRequest (); HandleEoms (); } mpi::Barrier (g.VCComm ()); +#endif } #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) @@ -812,22 +871,23 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = false; recvVector_.clear (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else sentEomTo_.clear (); haveEomFrom_.clear (); sendingData_.clear (); sendingRequest_.clear (); sendingReply_.clear (); - - dataVectors_.clear (); - requestVectors_.clear (); - replyVectors_.clear (); - + dataSendRequests_.clear (); requestSendRequests_.clear (); replySendRequests_.clear (); - eomSendRequests_.clear (); +#endif + dataVectors_.clear (); + requestVectors_.clear (); + replyVectors_.clear (); } template class AxpyInterface < Int >; diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index adb800e93f..f2faeb1f11 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -17,7 +17,7 @@ which can be found in the LICENSE file in the root directory, or at // TODO Complete the const interfaces... // TODO RMA related checks pending (e.g bounds checking)... 
-// TODO Consider DDT
+// TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined
 // TODO Use DEBUG_ONLY or something that EL provides instead of assert
 #if MPI_VERSION>=3
 namespace El
diff --git a/tests/core/HFsimulAxpyInt.cpp b/tests/core/HFsimulAxpyInt.cpp
index 293637853e..e3acadd133 100644
--- a/tests/core/HFsimulAxpyInt.cpp
+++ b/tests/core/HFsimulAxpyInt.cpp
@@ -157,14 +157,31 @@ int main (int argc, char *argv[])
       }
       interface.Detach ();
 #if DEBUG > 1
-      for (int j = 0; j < commSize; j++)
+      if (DIM <= 20 && commSize < 16)
       {
-        if (j == commRank)
+        for (int j = 0; j < commSize; j++)
         {
-          // Process 0 can now locally print its copy of A
-          if (DIM <= 20)
+          if (j == commRank)
+          {
+            Print (A, "Updated distributed A");
+          }
+        }
+        mpi::Barrier ( comm );
+        for (int j = 0; j < commSize; j++)
+        {
+          if (j == commRank)
+          {
+            // Process 0 can now locally print its copy of A
             Print (C, "Patch of A");
+          }
         }
+        mpi::Barrier ( comm );
+      }
+      else
+      {
+        if ( commRank == 0 && k == (ITER-1) )
+          std::cout << "Infinity norm of local matrix after "
+                    << k+1 << " iterations: " << InfinityNorm ( C ) << "\n";
+      }
 #endif
     }
diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp
index a965b16d7f..1095ae1ddf 100644
--- a/tests/core/HFsimulRMAInt.cpp
+++ b/tests/core/HFsimulRMAInt.cpp
@@ -128,8 +128,8 @@ int main (int argc, char *argv[])
     {
       for (int j = 0; j < DIM; j += AXPY_DIM)
       {
-        //Rmaint.Get (C, i, j);
-        Rmaint.LocalAcc (1.0, C, i, j);
+        Rmaint.Get (C, i, j);
+        //Rmaint.LocalAcc (1.0, C, i, j);
 #if DEBUG > 2
         std::cout << std::to_string(commRank) + ": GET patch: "
             + std::to_string(i) + "," + std::to_string(j)
@@ -144,25 +144,32 @@ int main (int argc, char *argv[])
       Rmaint.Detach ();
 
 #if DEBUG > 1
-      for (int j = 0; j < commSize; j++)
+      if (DIM <= 20 && commSize < 16)
       {
-        if (j == commRank)
+        for (int j = 0; j < commSize; j++)
         {
-          if (DIM <= 20)
+          if (j == commRank)
+          {
             Print (A, "Updated distributed A");
+          }
         }
-      }
-      mpi::Barrier ( comm );
-      for (int j = 0; j < commSize; j++)
-      {
-        if (j == commRank)
+        mpi::Barrier ( comm );
+        for (int j = 0; j < commSize; j++)
         {
-          // Process 0 can now locally print its copy of A
-          if (DIM <= 20)
+          if (j == commRank)
+          {
+            // Process 0 can now locally print its copy of A
             Print (C, "Patch of A");
+          }
         }
+        mpi::Barrier ( comm );
+      }
+      else
+      {
+        if ( commRank == 0 && k == (ITER-1) )
+          std::cout << "Infinity norm of local matrix after "
+                    << k+1 << " iterations: " << InfinityNorm ( C ) << "\n";
       }
-      mpi::Barrier ( comm );
 #endif
     }
     t2 = MPI_Wtime();

From e24ae01904767b371ff5fb170434cd2644d00abd Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Tue, 29 Jul 2014 11:12:20 -0500
Subject: [PATCH 053/110] removed scale as a parameter from RmaInterface, added
 a macro to enable derived type creation...
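[Editor's note] This commit changes the public accumulate from Acc(scale, Z, i, j) to Acc(Z, i, j); the reduction itself stays a plain sum. A caller that relied on the scaled form can pre-scale its local patch instead; a minimal caller-side sketch (assuming El's local Scale routine, and reusing this series' test names):

    Matrix < double > B (AXPY_DIM, AXPY_DIM);
    Identity (B, AXPY_DIM, AXPY_DIM);
    Scale (ALPHA, B);        // scale locally first...
    Rmaint.Acc (B, i, j);    // ...then plain accumulate
    Rmaint.Flush (B, i, j);  // make the update visible at the target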
--- include/El/core/RmaInterface.hpp | 11 ++++++----- include/El/core/imports/mpi.hpp | 2 ++ src/core/RmaInterface.cpp | 15 +++++++-------- src/core/imports/mpi.cpp | 4 +++- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 0fbe8d883e..17620a2340 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -36,13 +36,13 @@ class RmaInterface void Get( Matrix& Z, Int i, Int j ); void Get( const Matrix& Z, Int i, Int j ); - void Acc( T alpha, Matrix& Z, Int i, Int j ); - void Acc( T alpha, const Matrix& Z, Int i, Int j ); - void LocalAcc( T alpha, Matrix& Z, Int i, Int j ); + void Acc( Matrix& Z, Int i, Int j ); + void Acc( const Matrix& Z, Int i, Int j ); + void LocalAcc( Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z, Int i, Int j ); + void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z ); + void Flush( Matrix& Z ); void Flush( const Matrix& Z ); void Detach(); @@ -55,6 +55,7 @@ class RmaInterface DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; + bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; }; diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index d06a94c4f7..c7b7d1f9d2 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -101,7 +101,9 @@ typedef enum PARTIAL_ACC_ORDERING = 2, NO_ACC_ORDERING = 4 } acc_order_t; +#endif // for ddt +#ifdef EL_USE_DERIVED_DATATYPE typedef struct El_strided_s { unsigned num; diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index f2faeb1f11..1ae72cab5c 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -341,10 +341,10 @@ void RmaInterface::Get( const Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) } -// scaled accumulate = Update Y(i:i+height-1,j:j+width-1) += scale X, +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) +void RmaInterface::Acc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) @@ -399,13 +399,12 @@ void RmaInterface::Acc( T scale, Matrix& Z, Int i, Int j ) T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); - //src *= scale for( Int t=0; t::Acc( T scale, Matrix& Z, Int i, Int j ) } template -void RmaInterface::Acc( T scale, const Matrix& Z, Int i, Int j ) +void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) } -// scaled local accumulate, Z += scale * Get Y(i:i+height-1,j:j+width-1), +// local accumulate, Z += Get Y(i:i+height-1,j:j+width-1), // where Z is local matrix height x width template -void RmaInterface::LocalAcc( T scale, Matrix& Z, Int i, Int j ) +void RmaInterface::LocalAcc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalAcc")) // a call to Attach with a non-const DistMatrix must set @@ -507,7 +506,7 @@ void RmaInterface::LocalAcc( T scale, Matrix& Z, Int i, Int j ) T *YCol = Z.Buffer (0,rowShift+t*c); const T *XCol = &getData[t * localHeight]; for (Int s = 0; s < localHeight; ++s) - YCol[colShift+s*r] += scale * XCol[s]; + YCol[colShift+s*r] += XCol[s]; } // clear getVector_[destination].resize (0); diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 5a4b98c0bf..92a0a84ce8 100644 --- a/src/core/imports/mpi.cpp +++ 
b/src/core/imports/mpi.cpp @@ -503,6 +503,7 @@ void RmaProgress ( Comm comm ) } // TODO these functions for DDT creation are // completely untested +#ifdef EL_USE_DERIVED_DATATYPE void StridedDatatype (El_strided_t* stride_descr, Datatype old_type, Datatype* new_type, size_t* source_dims) @@ -547,7 +548,7 @@ void StridedDatatype (El_strided_t* stride_descr, reinterpret_cast(sizes), reinterpret_cast(stride_descr->offsets), MPI_ORDER_C, old_type, new_type) ); - + delete[] dims; delete[] sizes; } @@ -629,6 +630,7 @@ void VectorDatatype (El_iov_t * vect_descr, (const int *) vect_descr->sizes, vect_descr->offsets, old_type, new_type) ); } +#endif void WindowFree (Window & window) { From 1eea9cddc6cbd1fc7869c58139b24783e57300f5 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 29 Jul 2014 12:51:24 -0500 Subject: [PATCH 054/110] compacted acc function, got to test this --- src/core/RmaInterface.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 1ae72cab5c..f6f0ae7dc1 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -405,11 +405,8 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s::Flush( const Matrix& Z, Int i, Int j ) } } -// Are these only useful when the user wants to -// get/put the entire DistMatrix to it's local -// PE/everyone in world ? +// Perhaps this should be implemented as +// flush_all and not flush (rank, window) template void RmaInterface::Flush( Matrix& Z ) { From 85dde6ad9b8322d5868800bb39b96595392b2438 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 29 Jul 2014 14:26:06 -0500 Subject: [PATCH 055/110] remove const from Get params, in Get matrix always need to be updatable --- include/El/core/RmaInterface.hpp | 1 - src/core/RmaInterface.cpp | 163 +++++++++++++++++++++++++++++-- 2 files changed, 153 insertions(+), 11 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 17620a2340..1b2c52377f 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -34,7 +34,6 @@ class RmaInterface void Put( const Matrix& Z, Int i, Int j ); void Get( Matrix& Z, Int i, Int j ); - void Get( const Matrix& Z, Int i, Int j ); void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index f6f0ae7dc1..528ed84038 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -220,12 +220,12 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) putVector_[destination].resize( numEntries ); T* sendBuffer = putVector_[destination].data(); T* sendData = reinterpret_cast(sendBuffer); - const T* XBuffer = Z.LockedBuffer(); + T* XBuffer = Z.Buffer(); for( Int t=0; t void RmaInterface::Put( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const 
Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + const Int YLDim = Y.LDim (); + + for( Int step=0; step(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t @@ -335,12 +409,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) } } -template -void RmaInterface::Get( const Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) -} - // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template @@ -397,12 +465,12 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) putVector_[destination].resize( numEntries ); T* sendBuffer = putVector_[destination].data(); T* sendData = reinterpret_cast(sendBuffer); - const T* XBuffer = Z.LockedBuffer(); + T* XBuffer = Z.Buffer(); for( Int t=0; t void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated."); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative."); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix."); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim (); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step(sendBuffer); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t Date: Tue, 29 Jul 2014 17:51:35 -0500 Subject: [PATCH 056/110] move winfree and winunlock in try block --- tests/core/HFsimulAxpyInt.cpp | 10 +++++----- tests/core/HFsimulRMAInt.cpp | 9 ++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/core/HFsimulAxpyInt.cpp b/tests/core/HFsimulAxpyInt.cpp index e3acadd133..15e7f430c4 100644 --- a/tests/core/HFsimulAxpyInt.cpp +++ b/tests/core/HFsimulAxpyInt.cpp @@ -192,17 +192,17 @@ int main (int argc, char *argv[]) MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if (commRank == 0) - printf("Time taken for AXPY (secs):%lf \n", total_secs); + printf("Time taken (secs):%lf \n", total_secs); + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); } catch (std::exception & e) { ReportException (e); } - // clear window object for FOP - MPI_Win_unlock_all (win); - MPI_Win_free (&win); - mpi::Finalize (); return 0; } diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp index 1095ae1ddf..d7dda9d164 100644 --- a/tests/core/HFsimulRMAInt.cpp +++ b/tests/core/HFsimulRMAInt.cpp @@ -179,17 +179,16 @@ int main (int argc, char *argv[]) MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if (commRank == 0) - 
printf("Time taken for AXPY (secs):%lf \n", total_secs); + printf("Time taken (secs):%lf \n", total_secs); + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); } catch (std::exception & e) { ReportException (e); } - // clear window object for FOP - MPI_Win_unlock_all (win); - MPI_Win_free (&win); - mpi::Finalize (); return 0; } From e7b44c936e8920e112e35a2f716e32875a2493e7 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 30 Jul 2014 19:48:01 -0500 Subject: [PATCH 057/110] removing a barrier in detach, probably not required, we'll see --- src/core/RmaInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 528ed84038..80c59a22cb 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -863,7 +863,7 @@ void RmaInterface::Detach() GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - mpi::Barrier( g.VCComm() ); + // mpi::Barrier( g.VCComm() ); attached_ = false; detached_ = true; From b5e1ddb0028da64c8beec9f2fde9fbf51cbea87e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 4 Aug 2014 14:11:18 -0500 Subject: [PATCH 058/110] testing an implementation for nbc, this should be slightly fast, if not incorrect --- src/core/AxpyInterface.cpp | 12 ++++++++++++ src/core/RmaInterface.cpp | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index beec459105..72de60d8ac 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -232,12 +232,18 @@ namespace El const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + const Int index = replyVectors_[source].size(); + replyVectors_[source].resize (index + 1); + replyVectors_[source][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; + /* const Int index = replyVectors_[source].size(); for (Int i = 0; i < index; ++i) replyVectors_[source][i].resize ( bufferSize ); replyVectors_[source].resize (index + 1); replyVectors_[source][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; + */ #else const Int index = ReadyForSend (bufferSize, replyVectors_[source], replySendRequests_[source], @@ -542,12 +548,18 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + const Int index = dataVectors_[destination].size(); + dataVectors_[destination].resize (index + 1); + dataVectors_[destination][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; + /* const Int index = dataVectors_[destination].size(); for (Int i = 0; i < index; ++i) dataVectors_[destination][i].resize ( bufferSize ); dataVectors_[destination].resize (index + 1); dataVectors_[destination][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; + */ #else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 80c59a22cb..528ed84038 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -863,7 +863,7 @@ void RmaInterface::Detach() GlobalArrayPut_->Grid() : GlobalArrayGet_->Grid() ); - // mpi::Barrier( g.VCComm() ); + mpi::Barrier( g.VCComm() ); attached_ = false; detached_ = true; From bc0441af3fa00df3f3bf3c39b00dfa75a97f7b7d Mon Sep 17 00:00:00 2001 From: 
Sayan Ghosh Date: Tue, 5 Aug 2014 00:05:21 -0500 Subject: [PATCH 059/110] minor --- src/core/AxpyInterface.cpp | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 72de60d8ac..4afa55f4de 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -118,7 +118,7 @@ namespace El const Int count = mpi::GetCount < byte > (status); DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) LogicError ("Count was too small");) - const Int source = status.MPI_SOURCE; + const Int source = status.MPI_SOURCE; recvVector_.resize (count); byte *recvBuffer = recvVector_.data (); mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); @@ -232,18 +232,12 @@ namespace El const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - const Int index = replyVectors_[source].size(); - replyVectors_[source].resize (index + 1); - replyVectors_[source][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - /* const Int index = replyVectors_[source].size(); for (Int i = 0; i < index; ++i) replyVectors_[source][i].resize ( bufferSize ); replyVectors_[source].resize (index + 1); replyVectors_[source][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; - */ #else const Int index = ReadyForSend (bufferSize, replyVectors_[source], replySendRequests_[source], @@ -548,18 +542,12 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - const Int index = dataVectors_[destination].size(); - dataVectors_[destination].resize (index + 1); - dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - /* const Int index = dataVectors_[destination].size(); for (Int i = 0; i < index; ++i) dataVectors_[destination][i].resize ( bufferSize ); dataVectors_[destination].resize (index + 1); dataVectors_[destination][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; - */ #else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], @@ -875,7 +863,8 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::Barrier (g.VCComm ()); #endif } - + // NOTE is this barrier needed here? 
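+ // [Editor's note] With the nonblocking consensus enabled, arguably not:
+ // an Issend completes only once its matching receive has begun, and each
+ // rank starts the IBarrier only after its local Issends complete, so
+ // IBarrier completion already implies every message was matched (the
+ // DSDE argument). The barrier mainly guards the EOM-matching path.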
+ mpi::Barrier (g.VCComm ()); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = '0'; #endif From 843652574a56f8e9c5fef16116dfe649d4972bc8 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 5 Aug 2014 12:35:06 -0500 Subject: [PATCH 060/110] fixing nbc --- src/core/AxpyInterface.cpp | 72 +++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 4afa55f4de..d7bd600d50 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -118,7 +118,7 @@ namespace El const Int count = mpi::GetCount < byte > (status); DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) LogicError ("Count was too small");) - const Int source = status.MPI_SOURCE; + const Int source = status.MPI_SOURCE; recvVector_.resize (count); byte *recvBuffer = recvVector_.data (); mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); @@ -232,12 +232,19 @@ namespace El const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* const Int index = replyVectors_[source].size(); - for (Int i = 0; i < index; ++i) + replyVectors_[source][0].resize ( bufferSize ); + for (Int i = 0; i < index; ++i) replyVectors_[source][i].resize ( bufferSize ); replyVectors_[source].resize (index + 1); replyVectors_[source][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + replyVectors_[source].resize (index + 1); + replyVectors_[source][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; #else const Int index = ReadyForSend (bufferSize, replyVectors_[source], replySendRequests_[source], @@ -259,12 +266,13 @@ namespace El const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); MemCopy (sendCol, XCol, localHeight); } - // Fire off non-blocking send #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) mpi::TaggedISSend (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), dummy_request); + // nonblocking ssends must have been issued + all_sends_are_finished = '1'; #else mpi::TaggedISSend (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), @@ -542,12 +550,18 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* const Int index = dataVectors_[destination].size(); for (Int i = 0; i < index; ++i) dataVectors_[destination][i].resize ( bufferSize ); dataVectors_[destination].resize (index + 1); dataVectors_[destination][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + dataVectors_[destination].resize (index + 1); + dataVectors_[destination][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; #else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], @@ -622,12 +636,18 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) { const Int bufferSize = 4 * sizeof (Int); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* const Int index = requestVectors_[rank].size(); for (Int i = 0; i < index; ++i) requestVectors_[rank][i].resize ( bufferSize ); requestVectors_[rank].resize (index + 1); requestVectors_[rank][index].resize ( bufferSize ); mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + 
requestVectors_[rank].resize (index + 1); + requestVectors_[rank][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; #else const Int index = ReadyForSend (bufferSize, requestVectors_[rank], requestSendRequests_[rank], @@ -656,7 +676,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) requestSendRequests_[rank][index]); #endif } - // Receive all of the replies Int numReplies = 0; while (numReplies < p) @@ -702,7 +721,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) for (Int s = 0; s < localHeight; ++s) YCol[colShift + s * r] += alpha * XCol[s]; } - ++numReplies; } } @@ -829,9 +847,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; - // nonblocking ssends must have been issued - all_sends_are_finished = '1'; - // spin till all messages sent have been + // spin till all messages sent have been // received while (!DONE) { @@ -863,32 +879,33 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) mpi::Barrier (g.VCComm ()); #endif } - // NOTE is this barrier needed here? - mpi::Barrier (g.VCComm ()); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif + attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; - recvVector_.clear (); - + recvVector_.clear(); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; #else - sentEomTo_.clear (); - haveEomFrom_.clear (); + sentEomTo_.clear(); + haveEomFrom_.clear(); + + sendingData_.clear(); + sendingRequest_.clear(); + sendingReply_.clear(); +#endif - sendingData_.clear (); - sendingRequest_.clear (); - sendingReply_.clear (); + dataVectors_.clear(); + requestVectors_.clear(); + replyVectors_.clear(); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + dataSendRequests_.clear(); + requestSendRequests_.clear(); + replySendRequests_.clear(); - dataSendRequests_.clear (); - requestSendRequests_.clear (); - replySendRequests_.clear (); - eomSendRequests_.clear (); + eomSendRequests_.clear(); #endif - dataVectors_.clear (); - requestVectors_.clear (); - replyVectors_.clear (); } template class AxpyInterface < Int >; @@ -896,5 +913,4 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) template class AxpyInterface < double >; template class AxpyInterface < Complex < float >>; template class AxpyInterface < Complex < double >>; - } // namespace El From 062565cdfac48f2321fa80346f69cfdcac28dd3b Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 6 Aug 2014 13:59:43 -0500 Subject: [PATCH 061/110] cp axpyinterface to axpyint2...optimized 2 sided implementation --- include/El/core/AxpyInterface2.0.hpp | 102 +++ src/core/AxpyInterface2.0.cpp | 916 +++++++++++++++++++++++++++ 2 files changed, 1018 insertions(+) create mode 100644 include/El/core/AxpyInterface2.0.hpp create mode 100644 src/core/AxpyInterface2.0.cpp diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp new file mode 100644 index 0000000000..29473c3e29 --- /dev/null +++ b/include/El/core/AxpyInterface2.0.hpp @@ -0,0 +1,102 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + All rights reserved. + + Authors: + This interface is mainly due to Martin Schatz, but it was put into its + current form by Jack Poulson. 
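A detail worth flagging in the new AxpyInterface2.0.hpp that begins here: it reuses the EL_AXPYINTERFACE_HPP include guard of the existing AxpyInterface.hpp, so a translation unit that includes both headers silently drops whichever comes second. Patch 062 below renames the guard to EL_AXPYINTERFACE2_HPP; the intended shape is simply:

    // AxpyInterface2.0.hpp needs its own guard; sharing AxpyInterface.hpp's
    // guard makes the second #include expand to nothing.
    #ifndef EL_AXPYINTERFACE2_HPP
    #define EL_AXPYINTERFACE2_HPP
    // ... class declaration ...
    #endif // ifndef EL_AXPYINTERFACE2_HPP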
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at + http://opensource.org/licenses/BSD-2-Clause +*/ +#pragma once +#ifndef EL_AXPYINTERFACE_HPP +#define EL_AXPYINTERFACE_HPP + +namespace El { + +namespace AxpyTypeNS { +enum AxpyType { LOCAL_TO_GLOBAL, GLOBAL_TO_LOCAL }; +} +using namespace AxpyTypeNS; + +template +class AxpyInterface +{ +public: + AxpyInterface(); + ~AxpyInterface(); + + AxpyInterface( AxpyType type, DistMatrix& Z ); + AxpyInterface( AxpyType type, const DistMatrix& Z ); + + void Attach( AxpyType type, DistMatrix& Z ); + void Attach( AxpyType type, const DistMatrix& Z ); + + void Axpy( T alpha, Matrix& Z, Int i, Int j ); + void Axpy( T alpha, const Matrix& Z, Int i, Int j ); + + void Detach(); + +private: + static const Int + DATA_TAG =1, + EOM_TAG =2, + DATA_REQUEST_TAG=3, + DATA_REPLY_TAG =4; + +//request object for polling on Issends +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + byte all_sends_are_finished; +#endif + bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; + + DistMatrix* localToGlobalMat_; + const DistMatrix* globalToLocalMat_; + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + std::vector sentEomTo_, haveEomFrom_; + std::vector eomSendRequests_; + + std::vector> + sendingData_, sendingRequest_, sendingReply_; + std::vector> + dataSendRequests_, requestSendRequests_, replySendRequests_; +#endif + + std::vector recvVector_; + std::vector>> + dataVectors_, requestVectors_, replyVectors_; + + byte sendDummy_, recvDummy_; + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + // Check if we are done with this attachment's work + bool Finished(); + // Progress functions + void UpdateRequestStatuses(); + void HandleEoms(); + void StartSendingEoms(); + void FinishSendingEoms(); + + Int ReadyForSend + ( Int sendSize, + std::deque>& sendVectors, + std::deque& requests, + std::deque& requestStatuses ); +#endif + + void HandleLocalToGlobalData(); + void HandleGlobalToLocalRequest(); + + void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); + void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); +}; + +} // namespace El + +#endif // ifndef EL_AXPYINTERFACE_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp new file mode 100644 index 0000000000..d7bd600d50 --- /dev/null +++ b/src/core/AxpyInterface2.0.cpp @@ -0,0 +1,916 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + All rights reserved. + + Authors: + This interface is mainly due to Martin Schatz, but it was put into its + current form by Jack Poulson. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at + http://opensource.org/licenses/BSD-2-Clause +*/ +#include "El-lite.hpp" + +namespace El +{ +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + template < typename T > bool AxpyInterface < T >::Finished () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); + if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) + LogicError ("Not attached");) + const Grid & g = (attachedForLocalToGlobal_ ? 
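The copied #else path keeps Elemental's original strict end-of-message (EOM) matching, whose bookkeeping the Finished() routine that begins here inspects: a rank Issends a one-byte EOM to each peer once all of its sends to that peer have completed, and detaching is legal only when every EOM has been both sent and received. A stripped-down model of that handshake state, with the MPI traffic elided:

    #include <vector>

    // Per-peer end-of-message bookkeeping behind Finished()/HandleEoms().
    struct EomTracker
    {
        std::vector<bool> sentEomTo, haveEomFrom;

        explicit EomTracker( int p )
        : sentEomTo(p,false), haveEomFrom(p,false) {}

        // Called after all sends to 'peer' completed and the dummy EOM
        // byte was Issend'ed to it.
        void MarkSent( int peer )     { sentEomTo[peer] = true; }
        // Called when an EOM byte arrives from 'peer'.
        void MarkReceived( int peer ) { haveEomFrom[peer] = true; }

        // Mirrors AxpyInterface::Finished(): done only when the EOM
        // handshake is complete in both directions with every rank.
        bool Finished() const
        {
            for( std::size_t q = 0; q < sentEomTo.size(); ++q )
                if( !sentEomTo[q] || !haveEomFrom[q] )
                    return false;
            return true;
        }
    };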
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + bool finished = true; + for (Int rank = 0; rank < p; ++rank) + { + if (!sentEomTo_[rank] || !haveEomFrom_[rank]) + { + finished = false; + break; + } + } + return finished; + } + + template < typename T > void AxpyInterface < T >::HandleEoms () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleEoms")) + const Grid & g = (attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + UpdateRequestStatuses (); + + // Try to progress our EOM sends + for (Int i = 0; i < p; ++i) + { + if (!sentEomTo_[i]) + { + bool shouldSendEom = true; + const Int numSends = sendingData_[i].size (); + for (Int j = 0; j < numSends; ++j) + { + if (sendingData_[i][j]) + { + shouldSendEom = false; + break; + } + } + const Int numRequests = sendingRequest_[i].size (); + for (Int j = 0; j < numRequests; ++j) + { + if (!shouldSendEom || sendingRequest_[i][j]) + { + shouldSendEom = false; + break; + } + } + const Int numReplies = sendingReply_[i].size (); + for (Int j = 0; j < numReplies; ++j) + { + if (!shouldSendEom || sendingReply_[i][j]) + { + shouldSendEom = false; + break; + } + } + if (shouldSendEom) + { + mpi::Request & request = eomSendRequests_[i]; + mpi::TaggedISSend + (&sendDummy_, 1, i, EOM_TAG, g.VCComm (), request); + sentEomTo_[i] = true; + } + } + } + mpi::Status status; + if (mpi::IProbe (mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + mpi::TaggedRecv (&recvDummy_, 1, source, EOM_TAG, g.VCComm ()); + haveEomFrom_[source] = true; + } + } +#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE + + template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) + DistMatrix < T > &Y = *localToGlobalMat_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + mpi::Status status; + + if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) + { + // Message exists, so recv and pack + const Int count = mpi::GetCount < byte > (status); + DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) + LogicError ("Count was too small");) + const Int source = status.MPI_SOURCE; + recvVector_.resize (count); + byte *recvBuffer = recvVector_.data (); + mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); + // Extract the header + byte *head = recvBuffer; + const Int i = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T alpha = *reinterpret_cast < const T * >(head); + head += sizeof (T); + DEBUG_ONLY (if (height < 0 || width < 0) + RuntimeError + ("Unpacked heights were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha); + if (i < 0 + || j < + 0) RuntimeError ("Unpacked offsets were negative:\n", + " i= ", i, std::hex, "(", i, + ")\n", std::dec, " j= ", j, + std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, 
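The auto-formatter-mangled DEBUG_ONLY checks around this point merely validate the unpacked header fields: every local-to-global message carries (i, j, height, width, alpha) followed by a column-major payload, serialized through reinterpret_cast into one byte vector. A self-contained sketch of the same wire format, using memcpy (which sidesteps the alignment hazards of storing through a cast byte pointer) and a double payload for concreteness:

    #include <cassert>
    #include <cstring>
    #include <vector>

    // Pack the (i, j, height, width, alpha) header plus a column-major
    // payload into one contiguous buffer, mirroring the interface's
    // 4*sizeof(Int) + (numEntries+1)*sizeof(T) layout.
    std::vector<unsigned char> Pack( int i, int j, int height, int width,
                                     double alpha, const double* payload )
    {
        const std::size_t bytes =
            4*sizeof(int) + sizeof(double)*(1 + std::size_t(height)*width);
        std::vector<unsigned char> buf( bytes );
        unsigned char* head = buf.data();
        std::memcpy( head, &i, sizeof(int) );        head += sizeof(int);
        std::memcpy( head, &j, sizeof(int) );        head += sizeof(int);
        std::memcpy( head, &height, sizeof(int) );   head += sizeof(int);
        std::memcpy( head, &width, sizeof(int) );    head += sizeof(int);
        std::memcpy( head, &alpha, sizeof(double) ); head += sizeof(double);
        std::memcpy( head, payload, sizeof(double)*std::size_t(height)*width );
        return buf;
    }

    // Unpack and sanity-check the header, as the DEBUG_ONLY guards do.
    void UnpackHeader( const std::vector<unsigned char>& buf,
                       int globalHeight, int globalWidth )
    {
        const unsigned char* head = buf.data();
        int i, j, height, width; double alpha;
        std::memcpy( &i, head, sizeof(int) );      head += sizeof(int);
        std::memcpy( &j, head, sizeof(int) );      head += sizeof(int);
        std::memcpy( &height, head, sizeof(int) ); head += sizeof(int);
        std::memcpy( &width, head, sizeof(int) );  head += sizeof(int);
        std::memcpy( &alpha, head, sizeof(double) );
        assert( i >= 0 && j >= 0 && height >= 0 && width >= 0 );
        assert( i + height <= globalHeight && j + width <= globalWidth );
    }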
"(", + height, ")\n", std::dec, " width= ", + width, std::hex, "(", width, ")\n", + std::dec, " alpha= ", alpha); + if (i + height > Y.Height () + || j + width > + Y.Width ())RuntimeError + ("Unpacked submatrix was out of bounds:\n", " i= ", + i, std::hex, "(", i, ")\n", std::dec, " j= ", j, + std::hex, "(", j, ")\n", std::dec, " height=", height, + std::hex, "(", height, ")\n", std::dec, " width= ", + width, std::hex, "(", width, ")\n", std::dec, + " alpha= ", alpha);) + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(head); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += alpha * XCol[s]; + } + // Free the memory for the recv buffer + recvVector_.clear (); + } + } + + template < typename T > + void AxpyInterface < T >::HandleGlobalToLocalRequest () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalRequest")) + const DistMatrix < T > &X = *globalToLocalMat_; + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + + mpi::Status status; + if (mpi::IProbe (mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status)) + { + // Request exists, so recv + const Int source = status.MPI_SOURCE; + const Int recvSize = 4 * sizeof (Int); + recvVector_.resize (recvSize); + byte *recvBuffer = recvVector_.data (); + mpi::TaggedRecv + (recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm ()); + + // Extract the header + const byte *recvHead = recvBuffer; + const Int i = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int j = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int height = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + const Int width = *reinterpret_cast < const Int * >(recvHead); + recvHead += sizeof (Int); + + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int numEntries = localHeight * localWidth; + + const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* + const Int index = replyVectors_[source].size(); + replyVectors_[source][0].resize ( bufferSize ); + for (Int i = 0; i < index; ++i) + replyVectors_[source][i].resize ( bufferSize ); + replyVectors_[source].resize (index + 1); + replyVectors_[source][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + replyVectors_[source].resize (index + 1); + replyVectors_[source][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; 
+#else + const Int index = ReadyForSend (bufferSize, replyVectors_[source], + replySendRequests_[source], + sendingReply_[source]); +#endif + // Pack the reply header + byte *sendBuffer = replyVectors_[source][index].data (); + byte *sendHead = sendBuffer; + *reinterpret_cast < Int * >(sendHead) = myRow; + sendHead += sizeof (Int); + *reinterpret_cast < Int * >(sendHead) = myCol; + sendHead += sizeof (Int); + + // Pack the payload + T *sendData = reinterpret_cast < T * >(sendHead); + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &sendData[t * localHeight]; + const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } + // Fire off non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + dummy_request); + // nonblocking ssends must have been issued + all_sends_are_finished = '1'; +#else + mpi::TaggedISSend + (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + replySendRequests_[source][index]); +#endif + } + } + +template +AxpyInterface::AxpyInterface() +: attachedForLocalToGlobal_(false), attachedForGlobalToLocal_(false), + localToGlobalMat_(0), globalToLocalMat_(0), + sendDummy_(0), recvDummy_(0) +{ } + +template +AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) +: sendDummy_(0), recvDummy_(0) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + if( type == LOCAL_TO_GLOBAL ) + { + attachedForLocalToGlobal_ = true; + attachedForGlobalToLocal_ = false; + localToGlobalMat_ = &Z; + globalToLocalMat_ = 0; + } + else + { + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = true; + localToGlobalMat_ = 0; + globalToLocalMat_ = &Z; + } + + const Int p = Z.Grid().Size(); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + sentEomTo_.resize( p, false ); + haveEomFrom_.resize( p, false ); + + sendingData_.resize( p ); + sendingRequest_.resize( p ); + sendingReply_.resize( p ); + + dataSendRequests_.resize( p ); + requestSendRequests_.resize( p ); + replySendRequests_.resize( p ); + + eomSendRequests_.resize( p ); +#endif + + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); +} + +template +AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) +: sendDummy_(0), recvDummy_(0) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + if( type == LOCAL_TO_GLOBAL ) + { + LogicError("Cannot update a constant matrix"); + } + else + { + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = true; + localToGlobalMat_ = 0; + globalToLocalMat_ = &X; + } + + const Int p = X.Grid ().Size (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + + sendingData_.resize (p); + sendingRequest_.resize (p); + sendingReply_.resize (p); + + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + + eomSendRequests_.resize (p); +#endif + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + } + + template < typename T > AxpyInterface < T >::~AxpyInterface () + { + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + { + if (std::uncaught_exception ()) + { + const Grid & g = (attachedForLocalToGlobal_ ? 
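The reply path above sets all_sends_are_finished = '1' immediately after posting the Issend, discarding its request into dummy_request. Posting is weaker than completion; what makes synchronous sends useful for consensus is that an MPI_Issend request cannot complete before the receiver has started a matching receive. A two-rank demonstration of that property, runnable with mpirun -np 2:

    #include <mpi.h>
    #include <cstdio>

    int main( int argc, char** argv )
    {
        MPI_Init( &argc, &argv );
        int rank; MPI_Comm_rank( MPI_COMM_WORLD, &rank );
        if( rank == 0 )
        {
            int payload = 42; MPI_Request req;
            MPI_Issend( &payload, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &req );
            int done = 0;
            // Spins until rank 1 posts the matching receive; the Test
            // calls also give the MPI library cycles to make progress.
            while( !done )
                MPI_Test( &req, &done, MPI_STATUS_IGNORE );
            std::printf( "Issend completed: receive has started\n" );
        }
        else if( rank == 1 )
        {
            int payload;
            MPI_Recv( &payload, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
                      MPI_STATUS_IGNORE );
        }
        MPI_Finalize();
        return 0;
    }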
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + std::ostringstream os; + os << g.Rank () + << + "Uncaught exception detected during AxpyInterface destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str (); + DEBUG_ONLY (DumpCallStack ())} + else + { + Detach (); + } + } + } + + template < typename T > + void AxpyInterface < T >::Attach (AxpyType type, DistMatrix < T > &Z) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + LogicError ("Must detach before reattaching."); + + const Grid & g = Z.Grid (); + + if (type == LOCAL_TO_GLOBAL) + { + attachedForLocalToGlobal_ = true; + localToGlobalMat_ = &Z; + } + else + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &Z; + } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#endif + const Int p = Z.Grid ().Size (); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + // eom + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + // request objects + sendingRequest_.resize (p); + dataSendRequests_.resize (p); + eomSendRequests_.resize (p); + // ready-to-send + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + sendingData_.resize (p); + sendingReply_.resize (p); +#endif + + dataVectors_.resize (p); + requestVectors_.resize (p); + replyVectors_.resize (p); + } + + template < typename T > + void AxpyInterface < T >::Attach (AxpyType type, + const DistMatrix < T > &X) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) + if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) + LogicError ("Must detach before reattaching."); + + if (type == LOCAL_TO_GLOBAL) + { + LogicError ("Cannot update a constant matrix"); + } + else + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &X; + } +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#endif + const Int p = X.Grid ().Size (); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + // eom + sentEomTo_.resize (p, false); + haveEomFrom_.resize (p, false); + // request objects + dataSendRequests_.resize (p); + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + eomSendRequests_.resize (p); + // ready-to-send + sendingRequest_.resize (p); + sendingData_.resize (p); + sendingReply_.resize (p); +#endif + + dataVectors_.resize (p); + replyVectors_.resize (p); + requestVectors_.resize (p); + } + + template < typename T > + void AxpyInterface < T >::Axpy (T alpha, Matrix < T > &Z, Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) + if (attachedForLocalToGlobal_) + AxpyLocalToGlobal (alpha, Z, i, j); + else if (attachedForGlobalToLocal_) + AxpyGlobalToLocal (alpha, Z, i, j); + else + LogicError ("Cannot axpy before attaching."); + } + + template < typename T > + void AxpyInterface < T >::Axpy (T alpha, const Matrix < T > &Z, Int i, + Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) + if (attachedForLocalToGlobal_) + AxpyLocalToGlobal (alpha, Z, i, j); + else if (attachedForGlobalToLocal_) + LogicError ("Cannot update a constant matrix."); + else + LogicError ("Cannot axpy before attaching."); + } + +// Update 
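The destructor above (copied verbatim, including the missing separator between the rank and the message text in the os << chain) embodies a pattern worth naming: never run a throwing, collective Detach() while another exception is already unwinding, or the second throw terminates the program; diagnose and bail instead. Reduced to a sketch (std::uncaught_exception() is the pre-C++17 spelling the code uses; C++17 deprecates it in favor of std::uncaught_exceptions()):

    #include <exception>
    #include <iostream>

    struct Interface
    {
        bool attached = false;
        void Detach() { /* collective teardown that may throw */ attached = false; }

        ~Interface()
        {
            if( !attached )
                return;
            if( std::uncaught_exception() )
            {
                // Detaching here could throw during unwinding and call
                // std::terminate; report and skip the teardown instead.
                std::cerr << "Interface destroyed during unwinding; "
                             "skipping Detach to avoid terminate()\n";
            }
            else
                Detach();
        }
    };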
Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width + template < typename T > + void AxpyInterface < T >::AxpyLocalToGlobal + (T alpha, const Matrix < T > &X, Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyLocalToGlobal")) + DistMatrix < T > &Y = *localToGlobalMat_; + if (i < 0 || j < 0) + LogicError ("Submatrix offsets must be non-negative"); + if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) + LogicError ("Submatrix out of bounds of global matrix"); + + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myProcessRow = g.Row (); + const Int myProcessCol = g.Col (); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + + const Int height = X.Height (); + const Int width = X.Width (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for (Int step = 0; step < p; ++step) + { + const Int colShift = Shift (receivingRow, colAlign, r); + const Int rowShift = Shift (receivingCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int numEntries = localHeight * localWidth; + + if (numEntries != 0) + { + const Int destination = receivingRow + r * receivingCol; + const Int bufferSize = + 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* + const Int index = dataVectors_[destination].size(); + for (Int i = 0; i < index; ++i) + dataVectors_[destination][i].resize ( bufferSize ); + dataVectors_[destination].resize (index + 1); + dataVectors_[destination][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + dataVectors_[destination].resize (index + 1); + dataVectors_[destination][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; +#else + const Int index = + ReadyForSend (bufferSize, dataVectors_[destination], + dataSendRequests_[destination], + sendingData_[destination]); +#endif + DEBUG_ONLY (if + (Int (dataVectors_[destination][index].size ()) != + bufferSize) LogicError ("Error in ReadyForSend");) + // Pack the header + byte *sendBuffer = dataVectors_[destination][index].data (); + byte *head = sendBuffer; + *reinterpret_cast < Int * >(head) = i; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = j; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = height; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = width; + head += sizeof (Int); + *reinterpret_cast < T * >(head) = alpha; + head += sizeof (T); + + // Pack the payload + T *sendData = reinterpret_cast < T * >(head); + const T *XBuffer = X.LockedBuffer (); + const Int XLDim = X.LDim (); + for (Int t = 0; t < localWidth; ++t) + { + T *thisSendCol = &sendData[t * localHeight]; + const T *thisXCol = &XBuffer[(rowShift + t * c) * XLDim]; + for (Int s = 0; s < localHeight; ++s) + thisSendCol[s] = thisXCol[colShift + s * r]; + } + // Fire off the non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + dummy_request); +#else + mpi::TaggedISSend + (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + dataSendRequests_[destination][index]); +#endif + } + receivingRow = (receivingRow + 1) % r; + if (receivingRow == 0) + receivingCol = (receivingCol + 1) % c; + } + } + +// Update Y += 
alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix + template < typename T > + void AxpyInterface < T >::AxpyGlobalToLocal (T alpha, Matrix < T > &Y, + Int i, Int j) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyGlobalToLocal")) + const DistMatrix < T > &X = *globalToLocalMat_; + + const Int height = Y.Height (); + const Int width = Y.Width (); + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + + // Send out the requests to all processes in the grid + for (Int rank = 0; rank < p; ++rank) + { + const Int bufferSize = 4 * sizeof (Int); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + /* + const Int index = requestVectors_[rank].size(); + for (Int i = 0; i < index; ++i) + requestVectors_[rank][i].resize ( bufferSize ); + requestVectors_[rank].resize (index + 1); + requestVectors_[rank][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; + */ + const Int index = 0; + requestVectors_[rank].resize (index + 1); + requestVectors_[rank][index].resize ( bufferSize ); + mpi::Request dummy_request = mpi::REQUEST_NULL; +#else + const Int index = ReadyForSend (bufferSize, requestVectors_[rank], + requestSendRequests_[rank], + sendingRequest_[rank]); +#endif + // Copy the request header into the send buffer + byte *sendBuffer = requestVectors_[rank][index].data (); + byte *head = sendBuffer; + *reinterpret_cast < Int * >(head) = i; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = j; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = height; + head += sizeof (Int); + *reinterpret_cast < Int * >(head) = width; + head += sizeof (Int); + + // Begin the non-blocking send +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + mpi::TaggedISSend + (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), + dummy_request); +#else + mpi::TaggedISSend + (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), + requestSendRequests_[rank][index]); +#endif + } + // Receive all of the replies + Int numReplies = 0; + while (numReplies < p) + { + HandleGlobalToLocalRequest (); + mpi::Status status; + + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + + // Ensure that we have a recv buffer + const Int count = mpi::GetCount < byte > (status); + recvVector_.resize (count); + byte *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm ()); + + // Unpack the reply header + const byte *head = recvBuffer; + const Int row = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const Int col = *reinterpret_cast < const Int * >(head); + head += sizeof (Int); + const T *recvData = reinterpret_cast < const T * >(head); + + // Compute the local heights and offsets + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (row, colAlign, r); + const Int rowShift = Shift (col, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (0, rowShift + t * c); + const T *XCol = &recvData[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] += alpha 
* XCol[s]; + } + ++numReplies; + } + } + } + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + template < typename T > + Int AxpyInterface < T >::ReadyForSend + (Int sendSize, + std::deque < std::vector < byte >> &sendVectors, + std::deque < mpi::Request > &requests, + std::deque < bool > &requestStatuses) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReadyForSend")) + const Int numCreated = sendVectors.size (); + DEBUG_ONLY (if (numCreated != Int (requests.size ()) || + numCreated != + Int (requestStatuses.size ()))LogicError + ("size mismatch");) + for (Int i = 0; i < numCreated; ++i) + { + // If this request is still running, test to see if it finished. + if (requestStatuses[i]) + { + const bool finished = mpi::Test (requests[i]); + requestStatuses[i] = !finished; + } + + if (!requestStatuses[i]) + { + requestStatuses[i] = true; + sendVectors[i].resize (sendSize); + return i; + } + } + sendVectors.resize (numCreated + 1); + sendVectors[numCreated].resize (sendSize); + requests.push_back (mpi::REQUEST_NULL); + requestStatuses.push_back (true); + + return numCreated; + } + + template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) + const Grid & g = (attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + const Int numDataSendRequests = dataSendRequests_[i].size (); + for (Int j = 0; j < numDataSendRequests; ++j) + if (sendingData_[i][j]) + sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); + const Int numRequestSendRequests = requestSendRequests_[i].size (); + for (Int j = 0; j < numRequestSendRequests; ++j) + if (sendingRequest_[i][j]) + sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); + const Int numReplySendRequests = replySendRequests_[i].size (); + for (Int j = 0; j < numReplySendRequests; ++j) + if (sendingReply_[i][j]) + sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); + } + } +#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE + + template < typename T > void AxpyInterface < T >::Detach () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) + if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) + LogicError ("Must attach before detaching."); + + const Grid & g = (attachedForLocalToGlobal_ ? 
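ReadyForSend, kept above for the #else path, is the send-buffer pool that the NBC branches gave up when they pinned the index to 0: it Tests each in-flight request, reclaims the first slot whose send has completed, and grows the deques only when every slot is still busy, so a buffer is never rewritten while MPI may still be reading it. The pattern in isolation, against raw MPI requests:

    #include <mpi.h>
    #include <deque>
    #include <vector>

    // Find (or create) a send-buffer slot whose previous send finished.
    // 'busy[i]' means slot i still has a send in flight.
    int ReadyForSend( int sendSize,
                      std::deque<std::vector<char>>& buffers,
                      std::deque<MPI_Request>& requests,
                      std::deque<bool>& busy )
    {
        const int numCreated = int(buffers.size());
        for( int i = 0; i < numCreated; ++i )
        {
            if( busy[i] )
            {
                int finished = 0;
                MPI_Test( &requests[i], &finished, MPI_STATUS_IGNORE );
                busy[i] = !finished;
            }
            if( !busy[i] ) // slot is free: claim and resize it
            {
                busy[i] = true;
                buffers[i].resize( sendSize );
                return i;
            }
        }
        // Everything is still in flight; append a fresh slot.
        buffers.emplace_back( std::vector<char>( sendSize ) );
        requests.push_back( MPI_REQUEST_NULL );
        busy.push_back( true );
        return numCreated;
    }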
+ localToGlobalMat_->Grid () : globalToLocalMat_-> + Grid ()); + + if (attachedForLocalToGlobal_) + { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + // nonblocking ssends must have been issued + all_sends_are_finished = '1'; + // spin till all messages sent have been + // received + while (!DONE) + { + // probes for incoming message and + // receive + HandleLocalToGlobalData (); + + if (nb_bar_active) + { + // test/wait for IBarrier completion + DONE = mpi::Test (nb_bar_request); + } + else + { + if (all_sends_are_finished == '1') + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } + } +#else + while (!Finished ()) + { + HandleLocalToGlobalData (); + HandleEoms (); + } + mpi::Barrier (g.VCComm ()); +#endif + } + else + { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + // spin till all messages sent have been + // received + while (!DONE) + { + // probes for incoming message requests + // receives, and posts reply + HandleGlobalToLocalRequest (); + + if (nb_bar_active) + { + // test/wait for IBarrier completion + DONE = mpi::Test (nb_bar_request); + } + else + { + if (all_sends_are_finished == '1') + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } + } +#else + while (!Finished ()) + { + HandleGlobalToLocalRequest (); + HandleEoms (); + } + mpi::Barrier (g.VCComm ()); +#endif + } + + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = false; + recvVector_.clear(); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + all_sends_are_finished = '0'; +#else + sentEomTo_.clear(); + haveEomFrom_.clear(); + + sendingData_.clear(); + sendingRequest_.clear(); + sendingReply_.clear(); +#endif + + dataVectors_.clear(); + requestVectors_.clear(); + replyVectors_.clear(); + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else + dataSendRequests_.clear(); + requestSendRequests_.clear(); + replySendRequests_.clear(); + + eomSendRequests_.clear(); +#endif + } + + template class AxpyInterface < Int >; + template class AxpyInterface < float >; + template class AxpyInterface < double >; + template class AxpyInterface < Complex < float >>; + template class AxpyInterface < Complex < double >>; +} // namespace El From 793a1a98fd2e158d2604d37c7430e3bdf539f3dd Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 9 Aug 2014 02:42:34 -0500 Subject: [PATCH 062/110] intermediate commit for axpy 2.0, to fix rma and orig nbc soon --- include/El/core.hpp | 1 + include/El/core/AxpyInterface2.0.hpp | 151 ++- include/El/core/RmaInterface.hpp | 3 +- src/core/AxpyInterface2.0.cpp | 1677 +++++++++++++------------- src/core/RmaInterface.cpp | 8 +- 5 files changed, 907 insertions(+), 933 deletions(-) diff --git a/include/El/core.hpp b/include/El/core.hpp index a73e9c404e..28bc6d6622 100644 --- a/include/El/core.hpp +++ b/include/El/core.hpp @@ -149,5 +149,6 @@ template class BlockDistMatrix; #include "El/core/random/impl.hpp" #include "El/core/AxpyInterface.hpp" #include "El/core/RmaInterface.hpp" +#include "El/core/AxpyInterface2.0.hpp" #endif // ifndef EL_CORE_HPP diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 29473c3e29..a5cde92a68 100644 --- 
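Patch 062 reshapes the copied class into AxpyInterface2 and trades the single Axpy entry point for an RMA-flavored API: Put/Get/Acc start a transfer, Flush or LocalFlush completes it, and per-matrix metadata (opKind_, matrixBase_) lets a later Flush find the outstanding operation. A hedged sketch of the intended call sequence; the class and method names follow the header below, but everything else here (matrix sizes, the El.hpp entry header, initialization calls) is illustrative rather than tested code.

    #include "El.hpp"

    int main( int argc, char* argv[] )
    {
        El::Initialize( argc, argv );
        {
            El::Grid grid( El::mpi::COMM_WORLD );
            El::DistMatrix<double> A( 8, 8, grid ); // global matrix
            El::Matrix<double> loc( 2, 2 );         // local patch

            El::AxpyInterface2<double> interface;
            interface.Attach( A );        // collective over A's grid

            interface.Put( loc, 0, 0 );   // start local -> global transfer
            interface.Flush( loc, 0, 0 ); // force its completion

            interface.Get( loc, 4, 4 );   // start global -> local transfer
            interface.Flush( loc, 4, 4 ); // loc now holds A(4:5,4:5)

            interface.Detach();           // collective teardown
        }
        El::Finalize();
        return 0;
    }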
a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -1,102 +1,89 @@ /* - Copyright (c) 2009-2014, Jack Poulson - Copyright (c) 2011, The University of Texas at Austin - All rights reserved. - - Authors: - This interface is mainly due to Martin Schatz, but it was put into its - current form by Jack Poulson. - - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #pragma once -#ifndef EL_AXPYINTERFACE_HPP -#define EL_AXPYINTERFACE_HPP +#ifndef EL_AXPYINTERFACE2_HPP +#define EL_AXPYINTERFACE2_HPP namespace El { - -namespace AxpyTypeNS { -enum AxpyType { LOCAL_TO_GLOBAL, GLOBAL_TO_LOCAL }; -} -using namespace AxpyTypeNS; - template -class AxpyInterface -{ +class AxpyInterface2 +{ public: - AxpyInterface(); - ~AxpyInterface(); - - AxpyInterface( AxpyType type, DistMatrix& Z ); - AxpyInterface( AxpyType type, const DistMatrix& Z ); + AxpyInterface2(); + ~AxpyInterface2(); - void Attach( AxpyType type, DistMatrix& Z ); - void Attach( AxpyType type, const DistMatrix& Z ); + AxpyInterface2( DistMatrix& Z ); + AxpyInterface2( const DistMatrix& Z ); - void Axpy( T alpha, Matrix& Z, Int i, Int j ); - void Axpy( T alpha, const Matrix& Z, Int i, Int j ); + void Attach( DistMatrix& Z ); + void Attach( const DistMatrix& Z ); - void Detach(); + void Put( Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); -private: - static const Int - DATA_TAG =1, - EOM_TAG =2, - DATA_REQUEST_TAG=3, - DATA_REPLY_TAG =4; - -//request object for polling on Issends -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - byte all_sends_are_finished; -#endif - bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; + void Get( Matrix& Z, Int i, Int j ); - DistMatrix* localToGlobalMat_; - const DistMatrix* globalToLocalMat_; + void Acc( Matrix& Z, Int i, Int j ); + void Acc( const Matrix& Z, Int i, Int j ); + void LocalAcc( Matrix& Z, Int i, Int j ); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - std::vector sentEomTo_, haveEomFrom_; - std::vector eomSendRequests_; + void Flush( Matrix& Z, Int i, Int j ); + void Flush( const Matrix& Z, Int i, Int j ); + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + + void LocalFlush( Matrix& Z, Int i, Int j ); + void LocalFlush( const Matrix& Z, Int i, Int j ); + + void Detach(); - std::vector> - sendingData_, sendingRequest_, sendingReply_; +private: + + static const Int + DATA_PUT_TAG =1, + DATA_GET_TAG =2, + DATA_ACC_TAG =3, + DATA_LCC_TAG =4; + + /* Meta */ std::vector> - dataSendRequests_, requestSendRequests_, replySendRequests_; -#endif + dataRequests_; + std::vector> + dataRequestStatuses_; + std::vector> + matrixBase_; + std::vector> + opKind_; + /* Data */ + std::vector>> + getVectors_, putVectors_; + + DistMatrix* GlobalArrayPut_; + DistMatrix* GlobalArrayGet_; - std::vector recvVector_; - std::vector>> - dataVectors_, requestVectors_, replyVectors_; + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_, sends_complete_; + + Int GetIndexForMatrix ( Matrix& Z, const Int rank ); + void ProgressMatrix ( Matrix& Z, const Int rank ); + Int GetMatrixType ( Matrix& Z ); + + Int NextIndex (Int dataSize, + std::deque> &dataVectors, + std::deque &requests, + std::deque &requestStatus, + std::deque &opKind, 
+ Int op, + std::deque &matrixBase, + T * base); - byte sendDummy_, recvDummy_; - -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - // Check if we are done with this attachment's work - bool Finished(); - // Progress functions - void UpdateRequestStatuses(); - void HandleEoms(); - void StartSendingEoms(); - void FinishSendingEoms(); - - Int ReadyForSend - ( Int sendSize, - std::deque>& sendVectors, - std::deque& requests, - std::deque& requestStatuses ); -#endif - - void HandleLocalToGlobalData(); - void HandleGlobalToLocalRequest(); - - void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); - void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); + void HandleLocalToGlobalData( Matrix& Z, Int i, Int j ); + void HandleGlobalToLocalData( Matrix& Z, Int i, Int j ); + void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j ); + void HandleGlobalToLocalAcc( Matrix& Z, Int i, Int j ); }; - } // namespace El - -#endif // ifndef EL_AXPYINTERFACE_HPP +#endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 1b2c52377f..e324a2efa4 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -56,7 +56,8 @@ class RmaInterface const DistMatrix* GlobalArrayGet_; bool toBeAttachedForPut_, toBeAttachedForGet_, - attached_, detached_; + attached_, detached_, preceeding_put_, + preceeding_get_; }; #endif //MPI-3 } // namespace El diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index d7bd600d50..475e280a57 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -1,916 +1,895 @@ /* - Copyright (c) 2009-2014, Jack Poulson - Copyright (c) 2011, The University of Texas at Austin - All rights reserved. - - Authors: - This interface is mainly due to Martin Schatz, but it was put into its - current form by Jack Poulson. - - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at - http://opensource.org/licenses/BSD-2-Clause +This file is part of Elemental and is under the BSD 2-Clause License, +which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause */ #include "El-lite.hpp" +#include +// TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined namespace El { -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - template < typename T > bool AxpyInterface < T >::Finished () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); - if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - LogicError ("Not attached");) - const Grid & g = (attachedForLocalToGlobal_ ? 
- localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); - const Int p = g.Size (); +template +AxpyInterface2::AxpyInterface2() + : GlobalArrayPut_(0), GlobalArrayGet_(0), + putVectors_(0), getVectors_(0), dataRequests_(0), + dataRequestStatuses_(0), matrixBase_(0), opKind_(0), + toBeAttachedForPut_(false), toBeAttachedForGet_(false), + attached_(false), detached_(false) +{ } + +template +AxpyInterface2::AxpyInterface2( DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::AxpyInterface2")) + + attached_ = false; + detached_ = false; + toBeAttachedForGet_ = true; + toBeAttachedForPut_ = true; + GlobalArrayPut_ = &Z; + GlobalArrayGet_ = &Z; + + const Int p = Z.Grid ().Size(); + putVectors_.resize( p ); + getVectors_.resize( p ); + dataRequests_.resize (p); + dataRequestStatuses_.resize (p); + matrixBase_.resize (p); + opKind_.resize (p); +} + +template +AxpyInterface2::~AxpyInterface2() +{ + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during AxpyInterface2 destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY(DumpCallStack()) + } + else + { + Detach(); + } +} + +template +void AxpyInterface2::Attach( DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) + // attached_ will be only set in Attach + // and only unset in Detach + if (!attached_) + attached_ = true; + else + LogicError("Must detach before reattaching."); - bool finished = true; - for (Int rank = 0; rank < p; ++rank) - { - if (!sentEomTo_[rank] || !haveEomFrom_[rank]) - { - finished = false; - break; - } - } - return finished; - } - - template < typename T > void AxpyInterface < T >::HandleEoms () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleEoms")) - const Grid & g = (attachedForLocalToGlobal_ ? 
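Where the old interface recycled buffers through ReadyForSend, the NextIndex helper introduced here (its body appears just below) is append-only: every operation pushes a fresh buffer, request, status flag, operation tag, and matrix base address onto the per-rank deques, and reclamation is deferred to ProgressMatrix. Note that, as committed, it writes dataVectors[Index] without first growing that deque; fixing this is presumably among the repairs the "intermediate commit" message promises. A minimal model of the ledger it maintains, with hypothetical names:

    #include <mpi.h>
    #include <deque>
    #include <vector>

    // One pending operation per slot; slots are appended, never reused,
    // until a Flush-style progress call drains them.
    struct PendingOps
    {
        std::deque<std::vector<char>> buffers;  // serialized messages
        std::deque<MPI_Request>       requests; // one per message
        std::deque<bool>              active;   // still in flight?
        std::deque<int>               opKind;   // PUT / GET / ACC tag
        std::deque<const void*>       base;     // identifies the Matrix

        int NextIndex( int dataSize, int op, const void* matrixBase )
        {
            const int index = int(requests.size());
            // Grow the buffer deque before indexing into it.
            buffers.emplace_back( std::vector<char>( dataSize ) );
            requests.push_back( MPI_REQUEST_NULL );
            active.push_back( true );
            opKind.push_back( op );
            base.push_back( matrixBase );
            return index;
        }
    };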
- localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + } + const Grid& g = Z.Grid(); const Int p = g.Size (); - UpdateRequestStatuses (); - - // Try to progress our EOM sends - for (Int i = 0; i < p; ++i) - { - if (!sentEomTo_[i]) - { - bool shouldSendEom = true; - const Int numSends = sendingData_[i].size (); - for (Int j = 0; j < numSends; ++j) - { - if (sendingData_[i][j]) - { - shouldSendEom = false; - break; - } - } - const Int numRequests = sendingRequest_[i].size (); - for (Int j = 0; j < numRequests; ++j) - { - if (!shouldSendEom || sendingRequest_[i][j]) - { - shouldSendEom = false; - break; - } - } - const Int numReplies = sendingReply_[i].size (); - for (Int j = 0; j < numReplies; ++j) - { - if (!shouldSendEom || sendingReply_[i][j]) - { - shouldSendEom = false; - break; - } - } - if (shouldSendEom) - { - mpi::Request & request = eomSendRequests_[i]; - mpi::TaggedISSend - (&sendDummy_, 1, i, EOM_TAG, g.VCComm (), request); - sentEomTo_[i] = true; - } - } - } - mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - mpi::TaggedRecv (&recvDummy_, 1, source, EOM_TAG, g.VCComm ()); - haveEomFrom_[source] = true; - } - } -#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE - - template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) - DistMatrix < T > &Y = *localToGlobalMat_; - const Grid & g = Y.Grid (); + if (putVectors_.size() != p) + { + getVectors_.resize( p ); + putVectors_.resize( p ); + dataRequests_.resize (p); + dataRequestStatuses_.resize (p); + matrixBase_.resize (p); + opKind_.resize (p); + } +} + +template +Int AxpyInterface2::NextIndex +( Int dataSize, std::deque > &dataVectors, + std::deque &requests, + std::deque &requestStatus, + std::deque &opKind, + Int op, + std::deque &matrixBase, + T * base_address) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) + const Int Index = Int(requests.size ()); + + dataVectors[Index].resize (dataSize); + requests.push_back (mpi::REQUEST_NULL); + requestStatus.push_back (true); + opKind.push_back (op); + // stores Matrix base address by index + matrixBase.push_back (base_address); + + return Index; +} + +template +void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + 
for( Int step=0; step +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + + const DistMatrix &X = *GlobalArrayGet_; + + const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); + const Int p = g.Size (); const Int myRow = g.Row (); const Int myCol = g.Col (); - mpi::Status status; - - if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) - { - // Message exists, so recv and pack - const Int count = mpi::GetCount < byte > (status); - DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) - LogicError ("Count was too small");) - const Int source = status.MPI_SOURCE; - recvVector_.resize (count); - byte *recvBuffer = recvVector_.data (); - mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); - // Extract the header - byte *head = recvBuffer; - const Int i = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - DEBUG_ONLY (if (height < 0 || width < 0) - RuntimeError - ("Unpacked heights were negative:\n", - " i= ", i, std::hex, "(", i, ")\n", std::dec, - " j= ", j, std::hex, "(", j, ")\n", std::dec, - " height=", height, std::hex, "(", height, ")\n", - std::dec, " width= ", width, std::hex, "(", width, - ")\n", std::dec, " alpha= ", alpha); - if (i < 0 - || j < - 0) RuntimeError ("Unpacked offsets were negative:\n", - " i= ", i, std::hex, "(", i, - ")\n", std::dec, " j= ", j, - std::hex, "(", j, ")\n", std::dec, - " height=", height, std::hex, "(", - height, ")\n", std::dec, " width= ", - width, std::hex, "(", width, ")\n", - std::dec, " alpha= ", alpha); - if (i + height > Y.Height () - || j + width > - Y.Width ())RuntimeError - ("Unpacked submatrix was out of bounds:\n", " i= ", - i, std::hex, "(", i, ")\n", std::dec, " j= ", j, - std::hex, "(", j, ")\n", std::dec, " height=", height, - std::hex, "(", height, ")\n", std::dec, " width= ", - width, std::hex, "(", width, ")\n", std::dec, - " alpha= ", alpha);) - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; - } - // Free the memory for the recv buffer - recvVector_.clear (); - } - } - - template < typename T > - void AxpyInterface < T >::HandleGlobalToLocalRequest () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalRequest")) - const DistMatrix < T > &X = 
*globalToLocalMat_; + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + + const Int XLDim = X.LDim (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step(getVectors_[source][index].data ()); + // get request + mpi::TaggedIRecv (getBuffer, numEntries, source, DATA_GET_TAG, + g.VCComm (), dataRequests_[source][index]); + } + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } +} + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + for( Int step=0; step +void AxpyInterface2::LocalAcc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::LocalAcc")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); +if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + + const DistMatrix &X = *GlobalArrayGet_; + const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); + const Int p = g.Size (); const Int myRow = g.Row (); const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); - mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status)) - { - // Request exists, so recv - const Int source = status.MPI_SOURCE; - const Int recvSize = 4 * sizeof (Int); - recvVector_.resize (recvSize); - byte *recvBuffer = recvVector_.data (); - mpi::TaggedRecv - (recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm ()); - - // Extract the header - const byte *recvHead = recvBuffer; - const Int i = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const 
Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int numEntries = localHeight * localWidth; + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; - const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = replyVectors_[source].size(); - replyVectors_[source][0].resize ( bufferSize ); - for (Int i = 0; i < index; ++i) - replyVectors_[source][i].resize ( bufferSize ); - replyVectors_[source].resize (index + 1); - replyVectors_[source][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - replyVectors_[source].resize (index + 1); - replyVectors_[source][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else - const Int index = ReadyForSend (bufferSize, replyVectors_[source], - replySendRequests_[source], - sendingReply_[source]); -#endif - // Pack the reply header - byte *sendBuffer = replyVectors_[source][index].data (); - byte *sendHead = sendBuffer; - *reinterpret_cast < Int * >(sendHead) = myRow; - sendHead += sizeof (Int); - *reinterpret_cast < Int * >(sendHead) = myCol; - sendHead += sizeof (Int); - - // Pack the payload - T *sendData = reinterpret_cast < T * >(sendHead); - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &sendData[t * localHeight]; - const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } - // Fire off non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), - dummy_request); - // nonblocking ssends must have been issued - all_sends_are_finished = '1'; -#else - mpi::TaggedISSend - (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), - replySendRequests_[source][index]); -#endif - } - } + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + const Int XLDim = X.LDim (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step(getVectors_[source][index].data ()); + // get request + mpi::TaggedRecv (getBuffer, numEntries, source, + DATA_LCC_TAG, g.VCComm ()); + // update local matrix + for( Int t=0; t -AxpyInterface::AxpyInterface() -: attachedForLocalToGlobal_(false), attachedForGlobalToLocal_(false), - localToGlobalMat_(0), globalToLocalMat_(0), - sendDummy_(0), recvDummy_(0) -{ } +Int AxpyInterface2::GetIndexForMatrix ( Matrix& Z, const Int rank ) +{ + typename std::deque::iterator dit; + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + const Int index = (dit - matrixBase_[rank].begin()); + assert (index != matrixBase_[rank].size ()); + + return index; +} +// get operation associated with a matrix +// operation direction will be same for all ranks template -AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) -: sendDummy_(0), recvDummy_(0) +Int 
AxpyInterface2::GetMatrixType ( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) - if( type == LOCAL_TO_GLOBAL ) - { - attachedForLocalToGlobal_ = true; - attachedForGlobalToLocal_ = false; - localToGlobalMat_ = &Z; - globalToLocalMat_ = 0; - } - else + const Int index = GetIndexForMatrix ( Z, 0 ); + return opKind_[0][index]; +} + +// progress communication for a particular matrix +// this could be used to progress sends and recvs +template +void AxpyInterface2::ProgressMatrix ( Matrix& Z, const Int rank ) +{ + const Int index = GetIndexForMatrix ( Z, rank ); + if ( !dataRequestStatuses_[rank][index] ) // nothing to do + return; + // wait + mpi::Wait ( dataRequests_[rank][index] ); + dataRequestStatuses_[rank][index] = false; + getVectors_[rank][index].resize (0); + putVectors_[rank][index].resize (0); +} + +// local matrix could be updated after local flush +template +void AxpyInterface2::LocalFlush( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::LocalFlush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // local flush has no meaning for global to local + // transfers + Int type = GetMatrixType ( Z ); + if ( type == DATA_GET_TAG || type == DATA_LCC_TAG ) { - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = true; - localToGlobalMat_ = 0; - globalToLocalMat_ = &Z; + Flush ( Z, i, j ); + return; } - const Int p = Z.Grid().Size(); - -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - sentEomTo_.resize( p, false ); - haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); - sendingRequest_.resize( p ); - sendingReply_.resize( p ); + DistMatrix& Y = *GlobalArrayPut_; - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; - eomSendRequests_.resize( p ); -#endif + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step -AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) -: sendDummy_(0), recvDummy_(0) +void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) - if( type == LOCAL_TO_GLOBAL ) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + // ensure local completion for local to + // global transfers + Int type = GetMatrixType ( Z ); + switch ( type ) { - LogicError("Cannot update a constant matrix"); + case DATA_PUT_TAG: + { + LocalFlush ( Z, i, j ); + HandleLocalToGlobalData ( Z, i, j ); + break; + } + case DATA_ACC_TAG: + { + LocalFlush ( Z, i, j ); + HandleLocalToGlobalAcc ( Z, i, j ); + break; + } + case DATA_GET_TAG: + { + HandleGlobalToLocalData ( Z, i, j ); + break; + } + case DATA_LCC_TAG: + { + HandleGlobalToLocalAcc ( Z, i, j ); + break; + } + } - else - { - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = true; - localToGlobalMat_ = 0; - globalToLocalMat_ 
= &X; - } - - const Int p = X.Grid ().Size (); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - - sendingData_.resize (p); - sendingRequest_.resize (p); - sendingReply_.resize (p); - - dataSendRequests_.resize (p); - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - - eomSendRequests_.resize (p); -#endif - - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - } - - template < typename T > AxpyInterface < T >::~AxpyInterface () - { - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - { - if (std::uncaught_exception ()) - { - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); - std::ostringstream os; - os << g.Rank () - << - "Uncaught exception detected during AxpyInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str (); - DEBUG_ONLY (DumpCallStack ())} - else - { - Detach (); - } - } - } - - template < typename T > - void AxpyInterface < T >::Attach (AxpyType type, DistMatrix < T > &Z) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - LogicError ("Must detach before reattaching."); - - const Grid & g = Z.Grid (); - - if (type == LOCAL_TO_GLOBAL) - { - attachedForLocalToGlobal_ = true; - localToGlobalMat_ = &Z; - } - else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &Z; - } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif - const Int p = Z.Grid ().Size (); - -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - // eom - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - // request objects - sendingRequest_.resize (p); - dataSendRequests_.resize (p); - eomSendRequests_.resize (p); - // ready-to-send - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - sendingData_.resize (p); - sendingReply_.resize (p); -#endif - - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - } - - template < typename T > - void AxpyInterface < T >::Attach (AxpyType type, - const DistMatrix < T > &X) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - LogicError ("Must detach before reattaching."); - - if (type == LOCAL_TO_GLOBAL) - { - LogicError ("Cannot update a constant matrix"); - } - else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &X; - } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif - const Int p = X.Grid ().Size (); - -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - // eom - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - // request objects - dataSendRequests_.resize (p); - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - eomSendRequests_.resize (p); - // ready-to-send - sendingRequest_.resize (p); - sendingData_.resize (p); - sendingReply_.resize (p); -#endif - - dataVectors_.resize (p); - replyVectors_.resize (p); - requestVectors_.resize (p); - } - - template < typename T > - void AxpyInterface < T >::Axpy (T 
alpha, Matrix < T > &Z, Int i, Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) - if (attachedForLocalToGlobal_) - AxpyLocalToGlobal (alpha, Z, i, j); - else if (attachedForGlobalToLocal_) - AxpyGlobalToLocal (alpha, Z, i, j); - else - LogicError ("Cannot axpy before attaching."); - } - - template < typename T > - void AxpyInterface < T >::Axpy (T alpha, const Matrix < T > &Z, Int i, - Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) - if (attachedForLocalToGlobal_) - AxpyLocalToGlobal (alpha, Z, i, j); - else if (attachedForGlobalToLocal_) - LogicError ("Cannot update a constant matrix."); - else - LogicError ("Cannot axpy before attaching."); - } - -// Update Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width - template < typename T > - void AxpyInterface < T >::AxpyLocalToGlobal - (T alpha, const Matrix < T > &X, Int i, Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyLocalToGlobal")) - DistMatrix < T > &Y = *localToGlobalMat_; - if (i < 0 || j < 0) - LogicError ("Submatrix offsets must be non-negative"); - if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) - LogicError ("Submatrix out of bounds of global matrix"); - - const Grid & g = Y.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myProcessRow = g.Row (); - const Int myProcessCol = g.Col (); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; +} + +template +void AxpyInterface2::Flush( Matrix& Z ) +{ + + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) + Flush ( Z, 0, 0 ); +} - const Int height = X.Height (); - const Int width = X.Width (); + template < typename T > +void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForGet_ ) + LogicError("Local matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + std::vector < std::vector > putVector; + putVector.resize (p); + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - for (Int step = 0; step < p; ++step) - { - const Int colShift = Shift (receivingRow, colAlign, r); - const Int rowShift = Shift (receivingCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int numEntries = localHeight * localWidth; - if (numEntries != 0) - { - const Int destination = receivingRow + r * receivingCol; - const Int bufferSize = - 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = dataVectors_[destination].size(); - for (Int i = 0; i < index; ++i) - dataVectors_[destination][i].resize ( bufferSize ); - dataVectors_[destination].resize (index + 1); - dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - dataVectors_[destination].resize (index + 1); 
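	  // note (not in the original commit): under the nonblocking-consensus
	  // path the Issend below is deliberately untracked per message —
	  // completion of all sends is detected later by the IBarrier
	  // handshake in Detach(), so a single recycled buffer slot and a
	  // throwaway request are sufficient here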
- dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else - const Int index = - ReadyForSend (bufferSize, dataVectors_[destination], - dataSendRequests_[destination], - sendingData_[destination]); -#endif - DEBUG_ONLY (if - (Int (dataVectors_[destination][index].size ()) != - bufferSize) LogicError ("Error in ReadyForSend");) - // Pack the header - byte *sendBuffer = dataVectors_[destination][index].data (); - byte *head = sendBuffer; - *reinterpret_cast < Int * >(head) = i; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = j; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = height; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = width; - head += sizeof (Int); - *reinterpret_cast < T * >(head) = alpha; - head += sizeof (T); - - // Pack the payload - T *sendData = reinterpret_cast < T * >(head); - const T *XBuffer = X.LockedBuffer (); - const Int XLDim = X.LDim (); - for (Int t = 0; t < localWidth; ++t) - { - T *thisSendCol = &sendData[t * localHeight]; - const T *thisXCol = &XBuffer[(rowShift + t * c) * XLDim]; - for (Int s = 0; s < localHeight; ++s) - thisSendCol[s] = thisXCol[colShift + s * r]; - } - // Fire off the non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), - dummy_request); -#else - mpi::TaggedISSend - (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), - dataSendRequests_[destination][index]); -#endif - } - receivingRow = (receivingRow + 1) % r; - if (receivingRow == 0) - receivingCol = (receivingCol + 1) % c; - } - } - -// Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix - template < typename T > - void AxpyInterface < T >::AxpyGlobalToLocal (T alpha, Matrix < T > &Y, - Int i, Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyGlobalToLocal")) - const DistMatrix < T > &X = *globalToLocalMat_; - - const Int height = Y.Height (); - const Int width = Y.Width (); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); + const Int YLDim = Y.LDim (); - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); + for( Int step=0; step=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = requestVectors_[rank].size(); - for (Int i = 0; i < index; ++i) - requestVectors_[rank][i].resize ( bufferSize ); - requestVectors_[rank].resize (index + 1); - requestVectors_[rank][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - requestVectors_[rank].resize (index + 1); - requestVectors_[rank][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else - const Int index = ReadyForSend (bufferSize, requestVectors_[rank], - requestSendRequests_[rank], - sendingRequest_[rank]); -#endif - // Copy the request header into the send buffer - byte *sendBuffer = requestVectors_[rank][index].data (); - byte *head = sendBuffer; - *reinterpret_cast < Int * >(head) = i; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = j; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = height; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = width; - head += sizeof (Int); - - // Begin the non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, rank, 
DATA_REQUEST_TAG, g.VCComm (), - dummy_request); -#else - mpi::TaggedISSend - (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), - requestSendRequests_[rank][index]); -#endif - } - // Receive all of the replies - Int numReplies = 0; - while (numReplies < p) - { - HandleGlobalToLocalRequest (); - mpi::Status status; - - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - - // Ensure that we have a recv buffer - const Int count = mpi::GetCount < byte > (status); - recvVector_.resize (count); - byte *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm ()); - - // Unpack the reply header - const byte *head = recvBuffer; - const Int row = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int col = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T *recvData = reinterpret_cast < const T * >(head); - - // Compute the local heights and offsets - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (row, colAlign, r); - const Int rowShift = Shift (col, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (0, rowShift + t * c); - const T *XCol = &recvData[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] += alpha * XCol[s]; - } - ++numReplies; - } - } - } - -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else - template < typename T > - Int AxpyInterface < T >::ReadyForSend - (Int sendSize, - std::deque < std::vector < byte >> &sendVectors, - std::deque < mpi::Request > &requests, - std::deque < bool > &requestStatuses) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReadyForSend")) - const Int numCreated = sendVectors.size (); - DEBUG_ONLY (if (numCreated != Int (requests.size ()) || - numCreated != - Int (requestStatuses.size ()))LogicError - ("size mismatch");) - for (Int i = 0; i < numCreated; ++i) + if( numEntries != 0 ) { - // If this request is still running, test to see if it finished. 
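	// (mpi::Test both checks and, on success, releases the request, so a
	// finished slot's buffer can be resized and handed back to the
	// caller; if no slot has finished, a fresh one is appended below)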
- if (requestStatuses[i]) + const Int destination = receivingRow + r*receivingCol; + + putVector[destination].resize( numEntries ); + T* sendBuffer = putVector[destination].data(); + T* XBuffer = Z.Buffer(); + + for( Int t=0; t +void AxpyInterface2::HandleGlobalToLocalAcc ( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalAcc")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForGet_ ) + LogicError("Local matrix cannot be updated"); + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + std::vector < std::vector > putVector; + putVector.resize (p); + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + for( Int step=0; step void AxpyInterface < T >::UpdateRequestStatuses () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } +} + +template < typename T > +void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + const DistMatrix &X = *GlobalArrayPut_; + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); const Int p = g.Size (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + std::vector < std::vector > getVector; + getVector.resize (p); - for (Int i = 0; i < p; ++i) - { - const Int numDataSendRequests = dataSendRequests_[i].size (); - for (Int j = 0; j < numDataSendRequests; ++j) - if (sendingData_[i][j]) - sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); - const Int numRequestSendRequests = requestSendRequests_[i].size (); - for (Int j = 0; j < numRequestSendRequests; ++j) - if (sendingRequest_[i][j]) - sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); - const Int numReplySendRequests = replySendRequests_[i].size (); - for (Int j = 0; j < numReplySendRequests; ++j) - if (sendingReply_[i][j]) - sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); - } - } -#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE - - template < typename T > void AxpyInterface < T >::Detach () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) - if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - LogicError ("Must attach before detaching."); - - const Grid & g = (attachedForLocalToGlobal_ ? 
- localToGlobalMat_->Grid () : globalToLocalMat_-> - Grid ()); + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + const Int XLDim = X.LDim (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; - if (attachedForLocalToGlobal_) + for( Int step=0; step=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - // nonblocking ssends must have been issued - all_sends_are_finished = '1'; - // spin till all messages sent have been - // received - while (!DONE) + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + + if( numEntries != 0 ) { - // probes for incoming message and - // receive - HandleLocalToGlobalData (); - - if (nb_bar_active) - { - // test/wait for IBarrier completion - DONE = mpi::Test (nb_bar_request); - } - else + const Int destination = receivingRow + r*receivingCol; + + getVector[destination].resize ( numEntries ); + T *getBuffer = getVector[destination].data (); + + mpi::TaggedRecv (getBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm ()); + // update local matrix + for( Int t=0; t +void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + const DistMatrix &X = *GlobalArrayGet_; + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + std::vector < std::vector > getVector; + getVector.resize (p); + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError("Submatrix out of bounds of global matrix"); + + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + + const Int XLDim = X.LDim (); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - // spin till all messages sent have been - // received - while (!DONE) + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + + if( numEntries != 0 ) { - // probes for incoming message requests - // receives, and posts reply - HandleGlobalToLocalRequest (); - - if (nb_bar_active) - { - // test/wait for IBarrier completion - DONE = mpi::Test (nb_bar_request); - } - else + const Int destination = receivingRow + r*receivingCol; + + getVector[destination].resize ( numEntries ); + T *getBuffer = 
getVector[destination].data ();

            mpi::TaggedRecv (getBuffer, numEntries, destination,
                             DATA_ACC_TAG, g.VCComm ());
            // update local matrix
            for( Int t=0; t<localWidth; ++t )
            {
                T *YCol = Z.Buffer (0, rowShift+t*c);
                const T *XCol = &getBuffer[t*localHeight];
                for( Int s=0; s<localHeight; ++s )
                    YCol[colShift+s*r] += XCol[s];
            }
            // clear
            getVector[destination].resize (0);
            ProgressMatrix ( Z, destination );
        }
        receivingRow = (receivingRow + 1) % r;
        if( receivingRow == 0 )
            receivingCol = (receivingCol + 1) % c;
    }
}

template<typename T>
void AxpyInterface2<T>::Detach()
{
    DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Detach"))
    // destructor will call detach again...
    if (detached_)
        return;
    if( !attached_ )
        LogicError("Must attach before detaching.");

    const Grid& g = ( toBeAttachedForPut_ ?
                      GlobalArrayPut_->Grid() :
                      GlobalArrayGet_->Grid() );

    mpi::Barrier( g.VCComm() );

    attached_ = false;
    detached_ = true;
    toBeAttachedForPut_ = false;
    toBeAttachedForGet_ = false;

    GlobalArrayPut_ = 0;
    GlobalArrayGet_ = 0;

    putVectors_.clear();
    getVectors_.clear();

    dataRequests_.clear();
    dataRequestStatuses_.clear();
    matrixBase_.clear();
    opKind_.clear();
}

template class AxpyInterface2<Int>;
template class AxpyInterface2<float>;
template class AxpyInterface2<double>;
template class AxpyInterface2<Complex<float>>;
template class AxpyInterface2<Complex<double>>;

} // namespace El
diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp
index 528ed84038..cd58656984 100644
--- a/src/core/RmaInterface.cpp
+++ b/src/core/RmaInterface.cpp
@@ -19,6 +19,7 @@ which can be found in the LICENSE file in the root directory, or at
 // TODO RMA related checks pending (e.g. bounds checking)...
// TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined // TODO Use DEBUG_ONLY or something that EL provides instead of assert +// TODO Add a logic in Flush to return immediately if Get is used #if MPI_VERSION>=3 namespace El { @@ -28,7 +29,8 @@ RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), putVector_(0), getVector_(0), window (MPI_WIN_NULL), toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(false) + attached_(false), detached_(false), preceeding_get_(false), + preceeding_put_(false) { } template @@ -40,6 +42,8 @@ RmaInterface::RmaInterface( DistMatrix& Z ) detached_ = false; toBeAttachedForGet_ = true; toBeAttachedForPut_ = true; + preceeding_put_ = false; + preceeding_get_ = false; GlobalArrayPut_ = &Z; GlobalArrayGet_ = &Z; window = MPI_WIN_NULL; @@ -58,6 +62,8 @@ RmaInterface::RmaInterface( const DistMatrix& X ) detached_ = false; toBeAttachedForGet_ = true; toBeAttachedForPut_ = false; + preceeding_put_ = false; + preceeding_get_ = false; GlobalArrayGet_ = &X; GlobalArrayPut_ = 0; window = MPI_WIN_NULL; From db50fa205a5d87674911c6c24ae1ef7e9102e042 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 9 Aug 2014 03:45:54 -0500 Subject: [PATCH 063/110] intermediate commits, got to fix nbc next, then start over with axpy2 --- include/El/core/RmaInterface.hpp | 7 ++-- src/core/AxpyInterface2.0.cpp | 3 +- src/core/RmaInterface.cpp | 60 ++++++++++++++++++++------------ 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index e324a2efa4..147f8d819b 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -49,7 +49,7 @@ class RmaInterface private: mpi::Window window; - std::vector> + std::vector>> getVector_, putVector_; DistMatrix* GlobalArrayPut_; @@ -58,7 +58,10 @@ class RmaInterface bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_, preceeding_put_, preceeding_get_; - }; + + Int NextIndex ( Int dataSize, + std::deque > &dataVectors ); +}; #endif //MPI-3 } // namespace El #endif // ifndef EL_RMAINTERFACE_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 475e280a57..44116cf94e 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -106,7 +106,8 @@ Int AxpyInterface2::NextIndex { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) const Int Index = Int(requests.size ()); - + + dataVectors.resize (Index + 1); dataVectors[Index].resize (dataSize); requests.push_back (mpi::REQUEST_NULL); requestStatus.push_back (true); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index cd58656984..38ebb86299 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -173,6 +173,20 @@ void RmaInterface::Attach( const DistMatrix& X ) mpi::WindowLock (window); } +template +Int RmaInterface::NextIndex +( Int dataSize, + std::deque > &dataVectors ) +{ + DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) + const Int Index = Int(dataVectors.size ()); + + dataVectors.resize (Index + 1); + dataVectors[Index].resize (dataSize); + + return Index; +} + template void RmaInterface::Put( Matrix& Z, Int i, Int j ) { @@ -222,9 +236,10 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; + const Int index = RmaInterface::NextIndex ( numEntries, + putVector_[destination] ); - putVector_[destination].resize( numEntries ); - T* sendBuffer 
= putVector_[destination].data(); + T* sendBuffer = putVector_[destination][index].data(); T* sendData = reinterpret_cast(sendBuffer); T* XBuffer = Z.Buffer(); @@ -242,7 +257,7 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); // clear - putVector_[destination].resize (0); + putVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -302,9 +317,9 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - putVector_[destination].resize( numEntries ); - T* sendBuffer = putVector_[destination].data(); + const Int index = RmaInterface::NextIndex ( numEntries, + putVector_[destination] ); + T* sendBuffer = putVector_[destination][index].data(); T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); @@ -322,7 +337,7 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); // clear - putVector_[destination].resize (0); + putVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -383,9 +398,9 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - getVector_[destination].resize ( numEntries ); - T *getBuffer = getVector_[destination].data (); + const Int index = RmaInterface::NextIndex ( numEntries, + getVector_[destination] ); + T *getBuffer = getVector_[destination][index].data (); // get for( Int t=0; t::Get( Matrix& Z, Int i, Int j ) YCol[colShift+s*r] = XCol[s]; } // clear - getVector_[destination].resize (0); + getVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -467,9 +482,10 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; + const Int index = RmaInterface::NextIndex ( numEntries, + putVector_[destination] ); - putVector_[destination].resize( numEntries ); - T* sendBuffer = putVector_[destination].data(); + T* sendBuffer = putVector_[destination][index].data(); T* sendData = reinterpret_cast(sendBuffer); T* XBuffer = Z.Buffer(); @@ -487,7 +503,7 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); // clear - putVector_[destination].resize (0); + putVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -548,9 +564,9 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - putVector_[destination].resize( numEntries ); - T* sendBuffer = putVector_[destination].data(); + const Int index = RmaInterface::NextIndex ( numEntries, + putVector_[destination] ); + T* sendBuffer = putVector_[destination][index].data(); T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); @@ -568,7 +584,7 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); // clear - putVector_[destination].resize (0); + putVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -631,9 +647,9 @@ void RmaInterface::LocalAcc( Matrix& Z, Int 
i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - getVector_[destination].resize ( numEntries ); - T *getBuffer = getVector_[destination].data (); + const Int index = RmaInterface::NextIndex ( numEntries, + getVector_[destination] ); + T *getBuffer = getVector_[destination][index].data (); // get for( Int t=0; t::LocalAcc( Matrix& Z, Int i, Int j ) YCol[colShift+s*r] += XCol[s]; } // clear - getVector_[destination].resize (0); + getVector_[destination][index].resize (0); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) From 941a0543c41a2ad93d71df945b71de19977de292 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 9 Aug 2014 21:52:58 -0500 Subject: [PATCH 064/110] non blocking consensus is possible only for local to global in the current form without too much code change...so made the modification in orig axpyinterface, completely untested as of now --- include/El/core/AxpyInterface.hpp | 8 +- src/core/AxpyInterface.cpp | 163 ++++++------------------------ 2 files changed, 30 insertions(+), 141 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 29473c3e29..ce8c30ab68 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -56,16 +56,13 @@ class AxpyInterface DistMatrix* localToGlobalMat_; const DistMatrix* globalToLocalMat_; -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else std::vector sentEomTo_, haveEomFrom_; std::vector eomSendRequests_; - + std::vector> sendingData_, sendingRequest_, sendingReply_; std::vector> dataSendRequests_, requestSendRequests_, replySendRequests_; -#endif std::vector recvVector_; std::vector>> @@ -73,8 +70,6 @@ class AxpyInterface byte sendDummy_, recvDummy_; -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else // Check if we are done with this attachment's work bool Finished(); // Progress functions @@ -88,7 +83,6 @@ class AxpyInterface std::deque>& sendVectors, std::deque& requests, std::deque& requestStatuses ); -#endif void HandleLocalToGlobalData(); void HandleGlobalToLocalRequest(); diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index d7bd600d50..2a01b27237 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -15,8 +15,6 @@ namespace El { -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else template < typename T > bool AxpyInterface < T >::Finished () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); @@ -99,8 +97,7 @@ namespace El haveEomFrom_[source] = true; } } -#endif // NO USE OF THESE P2P SYNC FUNCTIONS WHEN NBC IS ACTIVE - + template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) @@ -231,25 +228,9 @@ namespace El const Int numEntries = localHeight * localWidth; const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = replyVectors_[source].size(); - replyVectors_[source][0].resize ( bufferSize ); - for (Int i = 0; i < index; ++i) - replyVectors_[source][i].resize ( bufferSize ); - replyVectors_[source].resize (index + 1); - replyVectors_[source][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - replyVectors_[source].resize (index + 1); - replyVectors_[source][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else const Int 
index = ReadyForSend (bufferSize, replyVectors_[source], replySendRequests_[source], sendingReply_[source]); -#endif // Pack the reply header byte *sendBuffer = replyVectors_[source][index].data (); byte *sendHead = sendBuffer; @@ -267,17 +248,9 @@ namespace El MemCopy (sendCol, XCol, localHeight); } // Fire off non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), - dummy_request); - // nonblocking ssends must have been issued - all_sends_are_finished = '1'; -#else mpi::TaggedISSend (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), replySendRequests_[source][index]); -#endif } } @@ -310,8 +283,6 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) const Int p = Z.Grid().Size(); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); @@ -319,13 +290,12 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) sendingRequest_.resize( p ); sendingReply_.resize( p ); + eomSendRequests_.resize( p ); + dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( p ); - eomSendRequests_.resize( p ); -#endif - dataVectors_.resize( p ); requestVectors_.resize( p ); replyVectors_.resize( p ); @@ -349,8 +319,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } const Int p = X.Grid ().Size (); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else + sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); @@ -358,13 +327,12 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) sendingRequest_.resize (p); sendingReply_.resize (p); + eomSendRequests_.resize (p); + dataSendRequests_.resize (p); requestSendRequests_.resize (p); replySendRequests_.resize (p); - eomSendRequests_.resize (p); -#endif - dataVectors_.resize (p); requestVectors_.resize (p); replyVectors_.resize (p); @@ -421,22 +389,21 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) #endif const Int p = Z.Grid ().Size (); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); // request objects sendingRequest_.resize (p); - dataSendRequests_.resize (p); + sendingData_.resize (p); + sendingReply_.resize (p); + eomSendRequests_.resize (p); + // ready-to-send requestSendRequests_.resize (p); replySendRequests_.resize (p); - sendingData_.resize (p); - sendingReply_.resize (p); -#endif - + dataSendRequests_.resize (p); + dataVectors_.resize (p); requestVectors_.resize (p); replyVectors_.resize (p); @@ -464,21 +431,20 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) #endif const Int p = X.Grid ().Size (); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); - // request objects - dataSendRequests_.resize (p); - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - eomSendRequests_.resize (p); // ready-to-send sendingRequest_.resize (p); sendingData_.resize (p); sendingReply_.resize (p); -#endif + + eomSendRequests_.resize (p); + + // ready-to-send + requestSendRequests_.resize (p); + replySendRequests_.resize (p); + dataSendRequests_.resize (p); dataVectors_.resize (p); replyVectors_.resize (p); @@ -549,25 +515,10 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int destination = receivingRow + r * receivingCol; 
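            // (message layout: a four-Int header {i, j, height, width} plus
            // alpha, followed by the column-packed payload — hence the
            // (numEntries+1)*sizeof(T) term in the buffer size below)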
const Int bufferSize = 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = dataVectors_[destination].size(); - for (Int i = 0; i < index; ++i) - dataVectors_[destination][i].resize ( bufferSize ); - dataVectors_[destination].resize (index + 1); - dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - dataVectors_[destination].resize (index + 1); - dataVectors_[destination][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else const Int index = ReadyForSend (bufferSize, dataVectors_[destination], dataSendRequests_[destination], sendingData_[destination]); -#endif DEBUG_ONLY (if (Int (dataVectors_[destination][index].size ()) != bufferSize) LogicError ("Error in ReadyForSend");) @@ -597,15 +548,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) thisSendCol[s] = thisXCol[colShift + s * r]; } // Fire off the non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), - dummy_request); -#else mpi::TaggedISSend (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), dataSendRequests_[destination][index]); -#endif } receivingRow = (receivingRow + 1) % r; if (receivingRow == 0) @@ -635,24 +580,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) for (Int rank = 0; rank < p; ++rank) { const Int bufferSize = 4 * sizeof (Int); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - /* - const Int index = requestVectors_[rank].size(); - for (Int i = 0; i < index; ++i) - requestVectors_[rank][i].resize ( bufferSize ); - requestVectors_[rank].resize (index + 1); - requestVectors_[rank][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; - */ - const Int index = 0; - requestVectors_[rank].resize (index + 1); - requestVectors_[rank][index].resize ( bufferSize ); - mpi::Request dummy_request = mpi::REQUEST_NULL; -#else const Int index = ReadyForSend (bufferSize, requestVectors_[rank], requestSendRequests_[rank], sendingRequest_[rank]); -#endif // Copy the request header into the send buffer byte *sendBuffer = requestVectors_[rank][index].data (); byte *head = sendBuffer; @@ -666,15 +596,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) head += sizeof (Int); // Begin the non-blocking send -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - mpi::TaggedISSend - (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), - dummy_request); -#else mpi::TaggedISSend (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), requestSendRequests_[rank][index]); -#endif } // Receive all of the replies Int numReplies = 0; @@ -726,8 +650,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else template < typename T > Int AxpyInterface < T >::ReadyForSend (Int sendSize, @@ -764,7 +686,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) return numCreated; } - + template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) @@ -789,7 +711,6 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); } } -#endif // NO USE OF THESE P2P SYNC FUNCTIONS 
WHEN NBC IS ACTIVE template < typename T > void AxpyInterface < T >::Detach () { @@ -800,6 +721,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Grid & g = (attachedForLocalToGlobal_ ? localToGlobalMat_->Grid () : globalToLocalMat_-> Grid ()); + const Int me = g.VCRank (); if (attachedForLocalToGlobal_) { @@ -807,7 +729,13 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; - // nonblocking ssends must have been issued + // progress my issends + for (int i = 0; i < dataSendRequests_[me].size(); i++) + { + mpi::Wait ( dataSendRequests_[me][i] ); + sendingData_[me][i] = false; + } + // nonblocking ssends must have been issued all_sends_are_finished = '1'; // spin till all messages sent have been // received @@ -843,41 +771,12 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } else { -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - // spin till all messages sent have been - // received - while (!DONE) - { - // probes for incoming message requests - // receives, and posts reply - HandleGlobalToLocalRequest (); - - if (nb_bar_active) - { - // test/wait for IBarrier completion - DONE = mpi::Test (nb_bar_request); - } - else - { - if (all_sends_are_finished == '1') - { - // all ssends are complete, start nonblocking barrier - mpi::IBarrier (g.VCComm (), nb_bar_request); - nb_bar_active = true; - } - } - } -#else while (!Finished ()) { HandleGlobalToLocalRequest (); HandleEoms (); } mpi::Barrier (g.VCComm ()); -#endif } attachedForLocalToGlobal_ = false; @@ -885,27 +784,23 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) recvVector_.clear(); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) all_sends_are_finished = '0'; -#else +#endif sentEomTo_.clear(); haveEomFrom_.clear(); sendingData_.clear(); sendingRequest_.clear(); sendingReply_.clear(); -#endif dataVectors_.clear(); requestVectors_.clear(); replyVectors_.clear(); -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) -#else dataSendRequests_.clear(); requestSendRequests_.clear(); replySendRequests_.clear(); eomSendRequests_.clear(); -#endif } template class AxpyInterface < Int >; From 718ca7ebce47145783796e2356f460ee76efb72e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 11 Aug 2014 00:30:54 -0500 Subject: [PATCH 065/110] intermediate commit, fixing send/recv --- include/El/core/AxpyInterface2.0.hpp | 4 +- src/core/AxpyInterface.cpp | 3 +- src/core/AxpyInterface2.0.cpp | 208 +++++++++++++++++---------- 3 files changed, 138 insertions(+), 77 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index a5cde92a68..8fde3e6ad8 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -69,8 +69,8 @@ class AxpyInterface2 Int GetIndexForMatrix ( Matrix& Z, const Int rank ); void ProgressMatrix ( Matrix& Z, const Int rank ); - Int GetMatrixType ( Matrix& Z ); - + Int GetMatrixType ( Matrix& Z, const Int rank ); + Int NextIndex (Int dataSize, std::deque> &dataVectors, std::deque &requests, diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 2a01b27237..04785bbbcc 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -732,8 +732,9 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) // progress my issends for (int i = 0; i < 
dataSendRequests_[me].size(); i++) { + if ( !mpi::Test ( dataSendRequests_[me][i] )) mpi::Wait ( dataSendRequests_[me][i] ); - sendingData_[me][i] = false; + sendingData_[me][i] = false; } // nonblocking ssends must have been issued all_sends_are_finished = '1'; diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 44116cf94e..11770c0384 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -188,7 +188,8 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) } // put request mpi::TaggedISSend (sendBuffer, numEntries, destination, - DATA_PUT_TAG, g.VCComm (), dataRequests_[destination][index]); + DATA_PUT_TAG, g.VCComm (), + dataRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -339,7 +340,8 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) } // acc request mpi::TaggedISSend (sendBuffer, numEntries, destination, - DATA_ACC_TAG, g.VCComm (), dataRequests_[destination][index]); + DATA_ACC_TAG, g.VCComm (), + dataRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -358,7 +360,7 @@ void AxpyInterface2::LocalAcc( Matrix& Z, Int i, Int j ) // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); -if( i < 0 || j < 0 ) + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); const DistMatrix &X = *GlobalArrayGet_; @@ -382,9 +384,6 @@ if( i < 0 || j < 0 ) const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - const Int XLDim = X.LDim (); Int receivingRow = myProcessRow; @@ -418,7 +417,7 @@ if( i < 0 || j < 0 ) // get request mpi::TaggedRecv (getBuffer, numEntries, source, DATA_LCC_TAG, g.VCComm ()); - // update local matrix + // acc to local matrix for( Int t=0; t::GetIndexForMatrix ( Matrix& Z, const Int rank ) dit = std::find ( matrixBase_[rank].begin(), matrixBase_[rank].end(), Z.LockedBuffer ()); const Int index = (dit - matrixBase_[rank].begin()); - assert (index != matrixBase_[rank].size ()); + //std::cout << "matrixBase size: " << matrixBase_[rank].size () << "\n"; + assert ( index != matrixBase_[rank].size () ); return index; } // get operation associated with a matrix -// operation direction will be same for all ranks template -Int AxpyInterface2::GetMatrixType ( Matrix& Z ) +Int AxpyInterface2::GetMatrixType ( Matrix& Z, const Int rank ) { - const Int index = GetIndexForMatrix ( Z, 0 ); - return opKind_[0][index]; + const Int index = GetIndexForMatrix ( Z, rank ); + return opKind_[rank][index]; } // progress communication for a particular matrix @@ -478,21 +477,13 @@ void AxpyInterface2::LocalFlush( Matrix& Z, Int i, Int j ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - // local flush has no meaning for global to local - // transfers - Int type = GetMatrixType ( Z ); - if ( type == DATA_GET_TAG || type == DATA_LCC_TAG ) - { - Flush ( Z, i, j ); - return; - } - DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; @@ -516,7 +507,14 @@ void AxpyInterface2::LocalFlush( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = 
receivingRow + r*receivingCol; - ProgressMatrix ( Z, destination ); + // local flush has no meaning for global to local + Int type = GetMatrixType ( Z, destination ); + if ( type == DATA_GET_TAG || type == DATA_LCC_TAG ) + { + Flush ( Z, i, j ); + return; + } + ProgressMatrix ( Z, destination ); } receivingRow = (receivingRow + 1) % r; @@ -528,47 +526,80 @@ void AxpyInterface2::LocalFlush( Matrix& Z, Int i, Int j ) // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress -template + template void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - // ensure local completion for local to - // global transfers - Int type = GetMatrixType ( Z ); - switch ( type ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + for( Int step=0; step void AxpyInterface2::Flush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) Flush ( Z, 0, 0 ); } @@ -592,8 +623,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - std::vector < std::vector > putVector; - putVector.resize (p); const Int XLDim = Z.LDim(); // local matrix width and height @@ -617,10 +646,19 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - putVector[destination].resize( numEntries ); - T* sendBuffer = putVector[destination].data(); - T* XBuffer = Z.Buffer(); + T* XBuffer = Z.Buffer(); + const Int index = + NextIndex (numEntries, putVectors_[destination], + dataRequests_[destination], + dataRequestStatuses_[destination], + opKind_[destination], + DATA_PUT_TAG, + matrixBase_[destination], + XBuffer); + DEBUG_ONLY (if + (Int (putVectors_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + T *sendBuffer = putVectors_[destination][index].data (); for( Int t=0; t::HandleGlobalToLocalAcc ( Matrix& Z, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - std::vector < std::vector > putVector; - putVector.resize (p); const Int XLDim = Z.LDim(); // local matrix width and height @@ -685,11 +721,19 @@ void AxpyInterface2::HandleGlobalToLocalAcc ( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - putVector[destination].resize( numEntries ); - T* sendBuffer = putVector[destination].data(); - T* XBuffer = Z.Buffer(); - + T* XBuffer = Z.Buffer(); + const Int index = + NextIndex (numEntries, putVectors_[destination], + dataRequests_[destination], + dataRequestStatuses_[destination], + opKind_[destination], + 
DATA_PUT_TAG, + matrixBase_[destination], + XBuffer); + DEBUG_ONLY (if + (Int (putVectors_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + T *sendBuffer = putVectors_[destination][index].data (); for( Int t=0; t::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - getVector[destination].resize ( numEntries ); - T *getBuffer = getVector[destination].data (); + T* XBuffer = Z.Buffer(); + const Int index = + NextIndex (numEntries, putVectors_[destination], + dataRequests_[destination], + dataRequestStatuses_[destination], + opKind_[destination], + DATA_GET_TAG, + matrixBase_[destination], + XBuffer); + DEBUG_ONLY (if + (Int (getVectors_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + T *getBuffer = getVectors_[destination][index].data (); mpi::TaggedRecv (getBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm ()); @@ -772,7 +826,6 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j ) YCol[colShift+s*r] = XCol[s]; } // clear - getVector[destination].resize (0); ProgressMatrix ( Z, destination ); } receivingRow = (receivingRow + 1) % r; @@ -801,8 +854,6 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j ) const Int myCol = g.Col (); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - std::vector < std::vector > getVector; - getVector.resize (p); // local width and height const Int height = Z.Height(); @@ -830,9 +881,19 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - getVector[destination].resize ( numEntries ); - T *getBuffer = getVector[destination].data (); + T* XBuffer = Z.Buffer(); + const Int index = + NextIndex (numEntries, putVectors_[destination], + dataRequests_[destination], + dataRequestStatuses_[destination], + opKind_[destination], + DATA_GET_TAG, + matrixBase_[destination], + XBuffer); + DEBUG_ONLY (if + (Int (getVectors_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + T *getBuffer = getVectors_[destination][index].data (); mpi::TaggedRecv (getBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm ()); @@ -845,7 +906,6 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j ) YCol[colShift+s*r] += XCol[s]; } // clear - getVector[destination].resize (0); ProgressMatrix ( Z, destination ); } receivingRow = (receivingRow + 1) % r; From 5dc36975ce4da2cdb4bd357c2f23f46fdab6f2e7 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 12 Aug 2014 00:34:37 -0500 Subject: [PATCH 066/110] fixing axpy 2.0... 
--- include/El/core/AxpyInterface2.0.hpp | 55 +- src/core/AxpyInterface2.0.cpp | 797 ++++++++------------------- 2 files changed, 270 insertions(+), 582 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 8fde3e6ad8..e112d1beba 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -28,7 +28,6 @@ class AxpyInterface2 void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - void LocalAcc( Matrix& Z, Int i, Int j ); void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); @@ -43,47 +42,45 @@ class AxpyInterface2 private: static const Int - DATA_PUT_TAG =1, - DATA_GET_TAG =2, - DATA_ACC_TAG =3, - DATA_LCC_TAG =4; + DATA_PUT_TAG =1, + DATA_GET_TAG =2, + DATA_ACC_TAG =3, + REQUEST_GET_TAG =4; /* Meta */ std::vector> - dataRequests_; + dataRequests_, requestRequests_; std::vector> - dataRequestStatuses_; - std::vector> - matrixBase_; - std::vector> - opKind_; + dataRequestStatuses_, + requestRequestStatuses_; + std::vector> + matrixBase_; /* Data */ std::vector>> getVectors_, putVectors_; DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; - + bool toBeAttachedForPut_, toBeAttachedForGet_, - attached_, detached_, sends_complete_; + attached_, detached_; - Int GetIndexForMatrix ( Matrix& Z, const Int rank ); - void ProgressMatrix ( Matrix& Z, const Int rank ); - Int GetMatrixType ( Matrix& Z, const Int rank ); + Int NextIndex ( Int dataSize, + std::deque > &dataVectors, + std::deque &requests, + std::deque &requestStatus, + std::deque &matrixBase, + T * base_address ); - Int NextIndex (Int dataSize, - std::deque> &dataVectors, - std::deque &requests, - std::deque &requestStatus, - std::deque &opKind, - Int op, - std::deque &matrixBase, - T * base); - - void HandleLocalToGlobalData( Matrix& Z, Int i, Int j ); - void HandleGlobalToLocalData( Matrix& Z, Int i, Int j ); - void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j ); - void HandleGlobalToLocalAcc( Matrix& Z, Int i, Int j ); + Int GetIndexForMatrix ( Matrix& Z, int rank ); + void ProgressMatrix ( Matrix& Z, int rank ); + + void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, + Int count, Int source ); + void HandleGlobalToLocalData( Matrix& Z, Int i, Int j, + Int count, Int source ); + void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j, + Int count, Int source ); }; } // namespace El #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 11770c0384..4112530873 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -7,15 +7,18 @@ which can be found in the LICENSE file in the root directory, or at #include // TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined +// TODO bring back const interfaces +// TODO localflush +// TODO error checks namespace El { template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - putVectors_(0), getVectors_(0), dataRequests_(0), - dataRequestStatuses_(0), matrixBase_(0), opKind_(0), - toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(false) + putVectors_(0), getVectors_(0), dataRequests_(0), matrixBase_(0), + dataRequestStatuses_(0), toBeAttachedForPut_(false), + toBeAttachedForGet_(false), attached_(false), + detached_(false) { } template @@ -34,9 +37,10 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) putVectors_.resize( p ); getVectors_.resize( p ); dataRequests_.resize (p); - 
dataRequestStatuses_.resize (p); + dataRequestStatuses_.resize (p); + requestRequests_.resize (p); + requestRequestStatuses_.resize (p); matrixBase_.resize (p); - opKind_.resize (p); } template @@ -85,12 +89,13 @@ void AxpyInterface2::Attach( DistMatrix& Z ) if (putVectors_.size() != p) { - getVectors_.resize( p ); - putVectors_.resize( p ); + getVectors_.resize( p ); + putVectors_.resize( p ); dataRequests_.resize (p); dataRequestStatuses_.resize (p); matrixBase_.resize (p); - opKind_.resize (p); + requestRequests_.resize (p); + requestRequestStatuses_.resize (p); } } @@ -99,10 +104,8 @@ Int AxpyInterface2::NextIndex ( Int dataSize, std::deque > &dataVectors, std::deque &requests, std::deque &requestStatus, - std::deque &opKind, - Int op, std::deque &matrixBase, - T * base_address) + T * base_address ) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) const Int Index = Int(requests.size ()); @@ -111,7 +114,6 @@ Int AxpyInterface2::NextIndex dataVectors[Index].resize (dataSize); requests.push_back (mpi::REQUEST_NULL); requestStatus.push_back (true); - opKind.push_back (op); // stores Matrix base address by index matrixBase.push_back (base_address); @@ -169,9 +171,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) NextIndex (numEntries, putVectors_[destination], dataRequests_[destination], dataRequestStatuses_[destination], - opKind_[destination], - DATA_PUT_TAG, - matrixBase_[destination], + matrixBase_[destination], XBuffer); DEBUG_ONLY (if @@ -201,70 +201,86 @@ template void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - - const DistMatrix &X = *GlobalArrayGet_; + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + DistMatrix& X = *GlobalArrayGet_; + + const Int height = X.Height (); + const Int width = X.Width (); + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); const Grid & g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + std::vector recvVector_; - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - const Int XLDim = X.LDim (); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + T* XBuffer = Z.Buffer(); + // Send out the requests to all processes in the grid + // 0 byte send, no need to cancel or post matching + // receive + for (Int rank = 0; rank < p; ++rank) + { + const Int bufferSize = 0; + const Int index = + NextIndex (0, putVectors_[rank], + requestRequests_[rank], + requestRequestStatuses_[rank], + matrixBase_[rank], + XBuffer); - for( Int step=0; step(getVectors_[source][index].data ()); - // get request - mpi::TaggedIRecv 
(getBuffer, numEntries, source, DATA_GET_TAG, - g.VCComm (), dataRequests_[source][index]); + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + + // Ensure that we have a recv buffer + const Int count = mpi::GetCount < byte > (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = X.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } + ++numReplies; + recvVector_.resize ( 0 ); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; } } @@ -321,8 +337,6 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) NextIndex (numEntries, putVectors_[destination], dataRequests_[destination], dataRequestStatuses_[destination], - opKind_[destination], - DATA_ACC_TAG, matrixBase_[destination], XBuffer); @@ -349,92 +363,9 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) } } -// local accumulate, Z += Get Y(i:i+height-1,j:j+width-1), -// where Z is local matrix height x width +// get index of a matrix for a particular process template -void AxpyInterface2::LocalAcc( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::LocalAcc")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - - const DistMatrix &X = *GlobalArrayGet_; - - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - const Int XLDim = X.LDim (); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - for( Int step=0; step(getVectors_[source][index].data ()); - // get request - mpi::TaggedRecv (getBuffer, numEntries, source, - DATA_LCC_TAG, g.VCComm ()); - // acc to local matrix - for( Int t=0; t -Int AxpyInterface2::GetIndexForMatrix ( Matrix& Z, const Int rank ) +Int AxpyInterface2::GetIndexForMatrix ( Matrix& Z, int rank ) { typename std::deque::iterator dit; dit = std::find ( matrixBase_[rank].begin(), @@ -442,24 +373,20 @@ Int AxpyInterface2::GetIndexForMatrix ( Matrix& Z, const Int rank ) const Int index = (dit - matrixBase_[rank].begin()); //std::cout << "matrixBase size: " << 
matrixBase_[rank].size () << "\n"; assert ( index != matrixBase_[rank].size () ); - + return index; } -// get operation associated with a matrix -template -Int AxpyInterface2::GetMatrixType ( Matrix& Z, const Int rank ) -{ - const Int index = GetIndexForMatrix ( Z, rank ); - return opKind_[rank][index]; -} - // progress communication for a particular matrix // this could be used to progress sends and recvs template -void AxpyInterface2::ProgressMatrix ( Matrix& Z, const Int rank ) +void AxpyInterface2::ProgressMatrix ( Matrix& Z, int rank ) { + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); const Int index = GetIndexForMatrix ( Z, rank ); + if ( !dataRequestStatuses_[rank][index] ) // nothing to do return; // wait @@ -469,131 +396,43 @@ void AxpyInterface2::ProgressMatrix ( Matrix& Z, const Int rank ) putVectors_[rank][index].resize (0); } -// local matrix could be updated after local flush -template -void AxpyInterface2::LocalFlush( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::LocalFlush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - DistMatrix& Y = *GlobalArrayPut_; - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step +template void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - - DistMatrix& Y = *GlobalArrayPut_; + + DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + mpi::Status status; - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step (status); + HandleLocalToGlobalData ( Z, i, j, count, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + const Int count = mpi::GetCount (status); + HandleLocalToGlobalAcc ( Z, i, j, count, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + const Int count = mpi::GetCount (status); + HandleGlobalToLocalData ( Z, i, j, count, status.MPI_SOURCE ); + break; + } } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; } } @@ -604,91 +443,115 @@ void AxpyInterface2::Flush( Matrix& Z ) Flush ( Z, 0, 0 ); } - template < typename T > -void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) +template < typename T > +void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, + Int count, Int source ) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForGet_ ) - LogicError("Local matrix 
cannot be updated"); - - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - const Int YLDim = Y.LDim (); - - for( Int step=0; step &Y = *GlobalArrayPut_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + int height = Z.Height(); + int width = Z.Width(); + std::vector getVector_; + + DEBUG_ONLY (if (count < Int (sizeof (T))) + LogicError ("Count was too small");) + T* Buffer = Z.Buffer(); + getVector_.resize (count); + DEBUG_ONLY (if + (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) + + T *getBuffer = getVector_.data (); + mpi::TaggedRecv (getBuffer, count, source, DATA_PUT_TAG, g.VCComm ()); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) { - const Int colShift = Shift( receivingRow, colAlign, r ); - const Int rowShift = Shift( receivingCol, rowAlign, c ); - // number of entries in my PE - const Int localHeight = Length( height, colShift, r ); - const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight * localWidth; - - if( numEntries != 0 ) - { - const Int destination = receivingRow + r*receivingCol; - T* XBuffer = Z.Buffer(); - const Int index = - NextIndex (numEntries, putVectors_[destination], - dataRequests_[destination], - dataRequestStatuses_[destination], - opKind_[destination], - DATA_PUT_TAG, - matrixBase_[destination], - XBuffer); - DEBUG_ONLY (if - (Int (putVectors_[destination][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = putVectors_[destination][index].data (); + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] = XCol[s]; + } + getVector_.resize ( 0 ); + // Free the memory + //ProgressMatrix ( Z, source ); +} - for( Int t=0; t +void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j, + Int count, Int source ) +{ + DistMatrix &Y = *GlobalArrayPut_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + int height = Z.Height(); + int width = Z.Width(); + std::vector getVector_; + + DEBUG_ONLY (if (count < Int (sizeof (T))) + LogicError ("Count was too small");) + T* Buffer = Z.Buffer(); + getVector_.resize (count); + DEBUG_ONLY (if + (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) + + T *getBuffer = getVector_.data (); + mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); + + // 
Update Y + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += XCol[s]; } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } + getVector_.resize ( 0 ); + // Free the memory + //ProgressMatrix ( Z, source ); } -// replica of above, except the tag is different - template < typename T > -void AxpyInterface2::HandleGlobalToLocalAcc ( Matrix& Z, Int i, Int j ) +// handle request for data, post a matching issend +template < typename T > +void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, + Int count, Int source ) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalAcc")) + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) - if( i < 0 || j < 0 ) + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForGet_ ) - LogicError("Local matrix cannot be updated"); + LogicError("Local matrix cannot be updated"); + DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -698,223 +561,49 @@ void AxpyInterface2::HandleGlobalToLocalAcc ( Matrix& Z, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; + std::vector putVector_; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - const Int YLDim = Y.LDim (); - - for( Int step=0; step -void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - - const DistMatrix &X = *GlobalArrayPut_; - - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); const Int myRow = g.Row (); const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - std::vector < std::vector > getVector; - getVector.resize (p); + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); + DEBUG_ONLY (if (count < Int (sizeof (T))) + LogicError ("Count was too small");) - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = 
(X.RowAlign() + j) % c; - const Int XLDim = X.LDim (); + putVector_.resize (count); + DEBUG_ONLY (if + (Int (putVector_.size ()) != count) LogicError ("Not enough space allocated");) - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + T *sendBuffer = putVector_.data (); - for( Int step=0; step -void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - const DistMatrix &X = *GlobalArrayGet_; - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - const Int XLDim = X.LDim (); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + // Fire off non-blocking send + mpi::TaggedSend (sendBuffer, count, source, + DATA_GET_TAG, g.VCComm ()); + // clear + //ProgressMatrix ( Z, source ); - for( Int step=0; step +// detach collectively +template void AxpyInterface2::Detach() { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Detach")) @@ -943,8 +632,10 @@ void AxpyInterface2::Detach() dataRequests_.clear(); dataRequestStatuses_.clear(); + requestRequests_.clear(); + requestRequestStatuses_.clear(); + matrixBase_.clear(); - opKind_.clear(); } template class AxpyInterface2; From 78eb49487c2184462425f9378e1df13f86886d16 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 12 Aug 2014 18:53:53 -0500 Subject: [PATCH 067/110] the coordinates are wrong (to-do next), but the structure is taking shape --- include/El/core/AxpyInterface2.0.hpp | 23 +- src/core/AxpyInterface2.0.cpp | 374 +++++++++++++++++---------- 2 files changed, 257 insertions(+), 140 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index e112d1beba..60ebef88b9 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -47,17 +47,21 @@ class AxpyInterface2 DATA_ACC_TAG =3, REQUEST_GET_TAG =4; - /* Meta */ + /* Request objects for send, recv and request op */ std::vector> - dataRequests_, requestRequests_; + sendRequests_, requestRequests_, + recvRequests_; + /* Request statuses for send, recv and request op */ std::vector> - dataRequestStatuses_, + sendRequestStatuses_, + recvRequestStatuses_, requestRequestStatuses_; + /* Stores matrix base addresses */ std::vector> matrixBase_; - /* Data */ + /* Receive and Send vectors */ std::vector>> - getVectors_, putVectors_; + recvVectors_, sendVectors_; DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; @@ -72,12 +76,13 @@ class AxpyInterface2 std::deque &matrixBase, T * base_address ); - Int GetIndexForMatrix ( Matrix& Z, int rank ); - void ProgressMatrix ( Matrix& Z, int rank ); + bool TestRequests ( Matrix& Z ); + bool TestSends ( Matrix& Z ); + bool TestRecvs ( Matrix& Z ); - void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, + void HandleGlobalToLocalData( Matrix& Z, Int i, Int j, Int count, Int source ); - void 
HandleGlobalToLocalData( Matrix& Z, Int i, Int j, + void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j, Int count, Int source ); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 4112530873..d9d57c142c 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -10,13 +10,15 @@ which can be found in the LICENSE file in the root directory, or at // TODO bring back const interfaces // TODO localflush // TODO error checks +// TODO fix progress function and call it in appropriate places namespace El { template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - putVectors_(0), getVectors_(0), dataRequests_(0), matrixBase_(0), - dataRequestStatuses_(0), toBeAttachedForPut_(false), + sendVectors_(0), recvVectors_(0), sendRequests_(0), recvRequests_(0), + matrixBase_(0), sendRequestStatuses_(0), requestRequestStatuses_(0), + recvRequestStatuses_(0), toBeAttachedForPut_(false), toBeAttachedForGet_(false), attached_(false), detached_(false) { } @@ -34,11 +36,13 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayGet_ = &Z; const Int p = Z.Grid ().Size(); - putVectors_.resize( p ); - getVectors_.resize( p ); - dataRequests_.resize (p); - dataRequestStatuses_.resize (p); + sendVectors_.resize( p ); + recvVectors_.resize( p ); + sendRequests_.resize (p); + recvRequests_.resize (p); requestRequests_.resize (p); + sendRequestStatuses_.resize (p); + recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); matrixBase_.resize (p); } @@ -87,15 +91,17 @@ void AxpyInterface2::Attach( DistMatrix& Z ) const Grid& g = Z.Grid(); const Int p = g.Size (); - if (putVectors_.size() != p) + if (sendVectors_.size() != p) { - getVectors_.resize( p ); - putVectors_.resize( p ); - dataRequests_.resize (p); - dataRequestStatuses_.resize (p); - matrixBase_.resize (p); + recvVectors_.resize( p ); + sendVectors_.resize( p ); + sendRequests_.resize (p); + recvRequests_.resize (p); requestRequests_.resize (p); + sendRequestStatuses_.resize (p); + recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); + matrixBase_.resize (p); } } @@ -113,7 +119,7 @@ Int AxpyInterface2::NextIndex dataVectors.resize (Index + 1); dataVectors[Index].resize (dataSize); requests.push_back (mpi::REQUEST_NULL); - requestStatus.push_back (true); + requestStatus.push_back ( true ); // stores Matrix base address by index matrixBase.push_back (base_address); @@ -168,16 +174,16 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (numEntries, putVectors_[destination], - dataRequests_[destination], - dataRequestStatuses_[destination], + NextIndex (numEntries, sendVectors_[destination], + sendRequests_[destination], + sendRequestStatuses_[destination], matrixBase_[destination], XBuffer); DEBUG_ONLY (if - (Int (putVectors_[destination][index].size ()) != + (Int (sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = putVectors_[destination][index].data (); + T *sendBuffer = sendVectors_[destination][index].data (); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) // put request mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), - dataRequests_[destination][index]); + sendRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ 
-208,8 +214,9 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) LogicError ("Cannot perform this operation as matrix is not attached."); DistMatrix& X = *GlobalArrayGet_; - const Int height = X.Height (); - const Int width = X.Width (); + const Int height = Z.Height (); + const Int width = Z.Width (); + if (i + height > X.Height () || j + width > X.Width ()) LogicError ("Invalid AxpyGlobalToLocal submatrix"); @@ -222,20 +229,20 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) T* XBuffer = Z.Buffer(); // Send out the requests to all processes in the grid - // 0 byte send, no need to cancel or post matching - // receive + // 0 byte send for (Int rank = 0; rank < p; ++rank) { const Int bufferSize = 0; const Int index = - NextIndex (0, putVectors_[rank], + NextIndex (bufferSize, sendVectors_[rank], requestRequests_[rank], requestRequestStatuses_[rank], matrixBase_[rank], XBuffer); - + // Copy the request header into the send buffer - T *sendBuffer = putVectors_[rank][index].data (); + T *sendBuffer = sendVectors_[rank][index].data (); + // Begin the non-blocking send mpi::TaggedISSend (sendBuffer, bufferSize, rank, REQUEST_GET_TAG, g.VCComm (), requestRequests_[rank][index]); @@ -250,9 +257,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount < byte > (status); + const Int count = mpi::GetCount (status); recvVector_.resize (count); T *recvBuffer = recvVector_.data (); @@ -334,16 +340,16 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (numEntries, putVectors_[destination], - dataRequests_[destination], - dataRequestStatuses_[destination], + NextIndex (numEntries, sendVectors_[destination], + sendRequests_[destination], + sendRequestStatuses_[destination], matrixBase_[destination], XBuffer); DEBUG_ONLY (if - (Int (putVectors_[destination][index].size ()) != + (Int (sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = putVectors_[destination][index].data (); + T *sendBuffer = sendVectors_[destination][index].data (); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) for( Int s=0; s::Acc( Matrix& Z, Int i, Int j ) } } -// get index of a matrix for a particular process +// progress communication for a particular matrix +// progress requests template -Int AxpyInterface2::GetIndexForMatrix ( Matrix& Z, int rank ) +bool AxpyInterface2::TestRequests ( Matrix& Z ) { + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + + Int index; typename std::deque::iterator dit; - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); - const Int index = (dit - matrixBase_[rank].begin()); - //std::cout << "matrixBase size: " << matrixBase_[rank].size () << "\n"; - assert ( index != matrixBase_[rank].size () ); + + for (int rank = 0; rank < p; ++rank) + { + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + index = (dit - matrixBase_[rank].begin()); + + if ( index == matrixBase_[rank].size () ) + continue; + if ( requestRequestStatuses_[rank].size() == 0 ) + continue; + if ( !requestRequestStatuses_[rank][index] ) // nothing to do + continue; + + // test all send requests related to matrix + requestRequestStatuses_[rank][index] = !mpi::Test ( 
requestRequests_[rank][index] ); + if ( requestRequestStatuses_[rank][index] ) + return false; + } + return true; +} + +// progress sends +template +bool AxpyInterface2::TestSends ( Matrix& Z ) +{ + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); - return index; + Int index; + typename std::deque::iterator dit; + + for (int rank = 0; rank < p; ++rank) + { + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + index = (dit - matrixBase_[rank].begin()); + + if ( index == matrixBase_[rank].size () ) + continue; + if ( sendRequestStatuses_[rank].size() == 0 ) + continue; + if ( !sendRequestStatuses_[rank][index] ) // nothing to do + continue; + + // test all sends related to matrix + sendRequestStatuses_[rank][index] = !mpi::Test ( sendRequests_[rank][index] ); + if ( sendRequestStatuses_[rank][index] ) + return false; + + // if test is true, then it is safe to free buffer + sendVectors_[rank][index].resize (0); + } + return true; } -// progress communication for a particular matrix -// this could be used to progress sends and recvs +// progress recvs template -void AxpyInterface2::ProgressMatrix ( Matrix& Z, int rank ) +bool AxpyInterface2::TestRecvs ( Matrix& Z ) { DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); const Int p = g.Size(); - const Int index = GetIndexForMatrix ( Z, rank ); + + Int index; + typename std::deque::iterator dit; - if ( !dataRequestStatuses_[rank][index] ) // nothing to do - return; - // wait - mpi::Wait ( dataRequests_[rank][index] ); - dataRequestStatuses_[rank][index] = false; - getVectors_[rank][index].resize (0); - putVectors_[rank][index].resize (0); + for (int rank = 0; rank < p; ++rank) + { + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + index = (dit - matrixBase_[rank].begin()); + + if ( index == matrixBase_[rank].size () ) + continue; + if ( recvRequestStatuses_[rank].size() == 0 ) + continue; + if ( !recvRequestStatuses_[rank][index] ) // nothing to do + continue; + + // test all receives related to matrix + recvRequestStatuses_[rank][index] = !mpi::Test ( recvRequests_[rank][index] ); + if ( recvRequestStatuses_[rank][index] ) + return false; + + // if test is true, then it is safe to free buffer + recvVectors_[rank][index].resize (0); + } + return true; } // flush ensures local and remote completion @@ -403,35 +479,57 @@ template void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - DistMatrix& Y = *GlobalArrayGet_; + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); + mpi::Status status; + + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; - if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) + while ( !DONE ) { - switch (status.MPI_TAG) + // similar to HandleXYZ functions in original AxpyInterface + if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) { - case DATA_PUT_TAG: - { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalData ( Z, i, j, count, status.MPI_SOURCE ); - break; - } - case DATA_ACC_TAG: - { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalAcc ( Z, i, j, count, status.MPI_SOURCE ); - break; - } - case 
REQUEST_GET_TAG: - { - const Int count = mpi::GetCount (status); - HandleGlobalToLocalData ( Z, i, j, count, status.MPI_SOURCE ); - break; - } + switch (status.MPI_TAG) + { + case DATA_PUT_TAG: + { + const Int count = mpi::GetCount (status); + HandleLocalToGlobalData ( Z, i, j, count, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + const Int count = mpi::GetCount (status); + HandleLocalToGlobalAcc ( Z, i, j, count, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + const Int count = mpi::GetCount (status); + HandleGlobalToLocalData ( Z, i, j, count, status.MPI_SOURCE ); + break; + } + } + } + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + } + else + { + // all sends (data or request) are complete + if ( TestSends( Z ) && TestRecvs( Z ) && TestRequests( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } } } } @@ -459,14 +557,15 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, DEBUG_ONLY (if (count < Int (sizeof (T))) LogicError ("Count was too small");) - T* Buffer = Z.Buffer(); + getVector_.resize (count); DEBUG_ONLY (if (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) - T *getBuffer = getVector_.data (); + T *getBuffer = getVector_.data (); + // post receive mpi::TaggedRecv (getBuffer, count, source, DATA_PUT_TAG, g.VCComm ()); - + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign () + i) % r; @@ -486,9 +585,8 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, for (Int s = 0; s < localHeight; ++s) YCol[s] = XCol[s]; } - getVector_.resize ( 0 ); // Free the memory - //ProgressMatrix ( Z, source ); + getVector_.resize ( 0 ); } // replica of above function except this accumulates @@ -502,42 +600,44 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j, const Int c = g.Width (); const Int myRow = g.Row (); const Int myCol = g.Col (); - int height = Z.Height(); - int width = Z.Width(); + const int height = Z.Height(); + const int width = Z.Width(); + // data buffer std::vector getVector_; - DEBUG_ONLY (if (count < Int (sizeof (T))) - LogicError ("Count was too small");) - T* Buffer = Z.Buffer(); - getVector_.resize (count); - DEBUG_ONLY (if - (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) - - T *getBuffer = getVector_.data (); - mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += XCol[s]; - } - getVector_.resize ( 0 ); - // Free the memory - //ProgressMatrix ( Z, source ); + DEBUG_ONLY (if (count < Int (sizeof (T))) + LogicError ("Count was too small");) + // data buffer resize + getVector_.resize (count); + + DEBUG_ONLY (if + (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) + + T *getBuffer 
= getVector_.data (); + // post receive + mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += XCol[s]; + } + // Free the memory + getVector_.resize ( 0 ); } // handle request for data, post a matching issend @@ -547,7 +647,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) - if( i < 0 || j < 0 ) + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForGet_ ) LogicError("Local matrix cannot be updated"); @@ -561,7 +661,10 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; + // data buffer std::vector putVector_; + // dummy vector for 0 byte receive + std::vector dummyVector_; const Int XLDim = Z.LDim(); // local matrix width and height @@ -577,10 +680,18 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, const Int iLocalOffset = Length (i, Y.ColShift (), r); const Int jLocalOffset = Length (j, Y.RowShift (), c); - DEBUG_ONLY (if (count < Int (sizeof (T))) + const Int numEntries = (localHeight * localWidth); + + DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - putVector_.resize (count); + // data/dummy buffers resize + putVector_.resize ( numEntries ); + dummyVector_.resize ( 0 ); + T *dummyBuffer = dummyVector_.data (); + // post request for get + mpi::TaggedRecv (dummyBuffer, count, source, REQUEST_GET_TAG, g.VCComm ()); + DEBUG_ONLY (if (Int (putVector_.size ()) != count) LogicError ("Not enough space allocated");) @@ -594,12 +705,10 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, } // Fire off non-blocking send - mpi::TaggedSend (sendBuffer, count, source, - DATA_GET_TAG, g.VCComm ()); - // clear - //ProgressMatrix ( Z, source ); + mpi::TaggedSend (sendBuffer, count, source, DATA_GET_TAG, g.VCComm ()); putVector_.resize ( 0 ); + dummyVector_.resize ( 0 ); } // detach collectively @@ -607,9 +716,9 @@ template void AxpyInterface2::Detach() { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Detach")) - // destructor will call detach again... - if (detached_) - return; + // destructor will call detach again... 
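// --- Editor's note (illustrative sketch, not part of the patch) --------------
// The rewritten Flush above is an instance of NBX-style nonblocking consensus:
// keep draining incoming messages with a wildcard probe; once all locally
// issued nonblocking sends have completed, enter a nonblocking barrier; keep
// draining until the barrier request tests true, which happens only after
// every rank has entered it. A minimal standalone sketch of the pattern,
// assuming hypothetical helpers process_message() and my_sends_done():
#include <mpi.h>
void process_message( const MPI_Status& status ); // hypothetical: recv + apply one update
bool my_sends_done();                             // hypothetical: all local Issends tested complete

void NbxFlush( MPI_Comm comm ) // requires MPI-3 for MPI_Ibarrier
{
    MPI_Request barrier = MPI_REQUEST_NULL;
    int barrierActive = 0, done = 0;
    while( !done )
    {
        int flag = 0;
        MPI_Status status;
        MPI_Iprobe( MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &flag, &status );
        if( flag )
            process_message( status ); // dispatch on status.MPI_TAG, as in Flush
        if( barrierActive )
            MPI_Test( &barrier, &done, MPI_STATUS_IGNORE );
        else if( my_sends_done() )
        {
            MPI_Ibarrier( comm, &barrier );
            barrierActive = 1;
        }
    }
}
// ----------------------------------------------------------------------------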
+ if (detached_) + return; if( !attached_ ) LogicError("Must attach before detaching."); @@ -627,12 +736,15 @@ void AxpyInterface2::Detach() GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - putVectors_.clear(); - getVectors_.clear(); + sendVectors_.clear(); + recvVectors_.clear(); - dataRequests_.clear(); - dataRequestStatuses_.clear(); + sendRequests_.clear(); + recvRequests_.clear(); requestRequests_.clear(); + + sendRequestStatuses_.clear(); + recvRequestStatuses_.clear(); requestRequestStatuses_.clear(); matrixBase_.clear(); From c63fda3db58531f815efc0841e9d665aa2029385 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 13 Aug 2014 12:28:32 -0500 Subject: [PATCH 068/110] intermediate commit --- include/El/core/AxpyInterface2.0.hpp | 28 +++- src/core/AxpyInterface2.0.cpp | 238 ++++++++++++++++++--------- 2 files changed, 181 insertions(+), 85 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 60ebef88b9..8420b98f75 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -50,19 +50,25 @@ class AxpyInterface2 /* Request objects for send, recv and request op */ std::vector> sendRequests_, requestRequests_, - recvRequests_; + recvRequests_, replyRequests_; + /* Request statuses for send, recv and request op */ std::vector> sendRequestStatuses_, recvRequestStatuses_, - requestRequestStatuses_; + requestRequestStatuses_, + replyRequestStatuses_; + /* Stores matrix base addresses */ std::vector> matrixBase_; + /* Receive and Send vectors */ std::vector>> - recvVectors_, sendVectors_; - + recvVectors_, sendVectors_, + replyVectors_, requestVectors_; + + // need to add const here... DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; @@ -75,13 +81,23 @@ class AxpyInterface2 std::deque &requestStatus, std::deque &matrixBase, T * base_address ); + // note: this is just a placeholder + // would be replaced soon + Int NextIndex ( Int dataSize, + std::deque > &dataVectors, + std::deque &requests, + std::deque &requestStatus, + std::deque &matrixBase, + const T * base_address ); + /* Test */ + // probably we need const interfaces also? 
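// --- Editor's note (illustrative sketch, not part of the patch) --------------
// The Test* members declared just below (TestRequests, TestReplies, TestSends,
// TestRecvs) all follow the same drain pattern: locate the slots owned by a
// given Matrix, poll each outstanding request with a nonblocking test, and
// reclaim the staging buffer once the send or receive has completed. A minimal
// sketch of that pattern, reusing the hypothetical Slot type from the earlier
// editor's note:
inline bool TestAndReclaim( std::deque<Slot>& slots )
{
    bool allDone = true;
    for( auto& slot : slots )
    {
        if( !slot.pending )
            continue;
        int flag = 0;
        MPI_Test( &slot.request, &flag, MPI_STATUS_IGNORE );
        if( flag )
        {
            slot.pending = false;
            slot.buffer.clear();         // safe to free once complete
            slot.buffer.shrink_to_fit();
        }
        else
            allDone = false;
    }
    return allDone; // true only when every tracked request has finished
}
// ----------------------------------------------------------------------------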
bool TestRequests ( Matrix& Z ); + bool TestReplies ( Matrix& Z ); bool TestSends ( Matrix& Z ); bool TestRecvs ( Matrix& Z ); - void HandleGlobalToLocalData( Matrix& Z, Int i, Int j, - Int count, Int source ); + void HandleGlobalToLocalData( Matrix& Z, Int i, Int j ); void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j, diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index d9d57c142c..ba33be712c 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -16,11 +16,13 @@ namespace El template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - sendVectors_(0), recvVectors_(0), sendRequests_(0), recvRequests_(0), - matrixBase_(0), sendRequestStatuses_(0), requestRequestStatuses_(0), - recvRequestStatuses_(0), toBeAttachedForPut_(false), - toBeAttachedForGet_(false), attached_(false), - detached_(false) + sendVectors_(0), recvVectors_(0), replyVectors_(0), + requestVectors_(0), sendRequests_(0), recvRequests_(0), + replyRequests_(0), requestRequests_(0), matrixBase_(0), + sendRequestStatuses_(0), requestRequestStatuses_(0), + replyRequestStatuses_(0), recvRequestStatuses_(0), + toBeAttachedForPut_(false), toBeAttachedForGet_(false), + attached_(false), detached_(false) { } template @@ -36,14 +38,22 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayGet_ = &Z; const Int p = Z.Grid ().Size(); + + requestVectors_.resize( p ); sendVectors_.resize( p ); recvVectors_.resize( p ); + replyVectors_.resize( p ); + sendRequests_.resize (p); recvRequests_.resize (p); + replyRequests_.resize (p); requestRequests_.resize (p); + sendRequestStatuses_.resize (p); recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); + replyRequestStatuses_.resize (p); + matrixBase_.resize (p); } @@ -95,12 +105,19 @@ void AxpyInterface2::Attach( DistMatrix& Z ) { recvVectors_.resize( p ); sendVectors_.resize( p ); + replyVectors_.resize( p ); + requestVectors_.resize( p ); + sendRequests_.resize (p); recvRequests_.resize (p); requestRequests_.resize (p); + replyRequests_.resize (p); + sendRequestStatuses_.resize (p); recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); + replyRequestStatuses_.resize (p); + matrixBase_.resize (p); } } @@ -118,8 +135,10 @@ Int AxpyInterface2::NextIndex dataVectors.resize (Index + 1); dataVectors[Index].resize (dataSize); + requests.push_back (mpi::REQUEST_NULL); requestStatus.push_back ( true ); + // stores Matrix base address by index matrixBase.push_back (base_address); @@ -207,11 +226,11 @@ template void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); DistMatrix& X = *GlobalArrayGet_; const Int height = Z.Height (); @@ -234,17 +253,17 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { const Int bufferSize = 0; const Int index = - NextIndex (bufferSize, sendVectors_[rank], + NextIndex (bufferSize, requestVectors_[rank], 
requestRequests_[rank], requestRequestStatuses_[rank], matrixBase_[rank], XBuffer); // Copy the request header into the send buffer - T *sendBuffer = sendVectors_[rank][index].data (); + T *requestBuffer = requestVectors_[rank][index].data (); - // Begin the non-blocking send - mpi::TaggedISSend (sendBuffer, bufferSize, rank, REQUEST_GET_TAG, g.VCComm (), + // Fire off non-blocking send + mpi::TaggedISSend (requestBuffer, bufferSize, rank, REQUEST_GET_TAG, g.VCComm (), requestRequests_[rank][index]); } // Receive all of the replies @@ -252,7 +271,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) while (numReplies < p) { mpi::Status status; - + HandleGlobalToLocalData ( Z, i, j ); if (mpi::IProbe (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { @@ -391,13 +410,16 @@ bool AxpyInterface2::TestRequests ( Matrix& Z ) continue; if ( requestRequestStatuses_[rank].size() == 0 ) continue; - if ( !requestRequestStatuses_[rank][index] ) // nothing to do - continue; - // test all send requests related to matrix - requestRequestStatuses_[rank][index] = !mpi::Test ( requestRequests_[rank][index] ); - if ( requestRequestStatuses_[rank][index] ) - return false; + const Int numStatuses = requestRequestStatuses_[rank].size(); + for (int i = 0; i < numStatuses; i++) + { + requestRequestStatuses_[rank][i] = !mpi::Test ( requestRequests_[rank][i] ); + if ( requestRequestStatuses_[rank][i] ) + return false; + } + // okay to deallocate + requestVectors_[rank][index].resize( 0 ); } return true; } @@ -423,13 +445,15 @@ bool AxpyInterface2::TestSends ( Matrix& Z ) continue; if ( sendRequestStatuses_[rank].size() == 0 ) continue; - if ( !sendRequestStatuses_[rank][index] ) // nothing to do - continue; // test all sends related to matrix - sendRequestStatuses_[rank][index] = !mpi::Test ( sendRequests_[rank][index] ); - if ( sendRequestStatuses_[rank][index] ) - return false; + const Int numStatuses = sendRequestStatuses_[rank].size(); + for (int i = 0; i < numStatuses; i++) + { + sendRequestStatuses_[rank][i] = !mpi::Test ( sendRequests_[rank][i] ); + if ( sendRequestStatuses_[rank][i] ) + return false; + } // if test is true, then it is safe to free buffer sendVectors_[rank][index].resize (0); @@ -458,13 +482,15 @@ bool AxpyInterface2::TestRecvs ( Matrix& Z ) continue; if ( recvRequestStatuses_[rank].size() == 0 ) continue; - if ( !recvRequestStatuses_[rank][index] ) // nothing to do - continue; - - // test all receives related to matrix - recvRequestStatuses_[rank][index] = !mpi::Test ( recvRequests_[rank][index] ); - if ( recvRequestStatuses_[rank][index] ) - return false; + + // test all sends related to matrix + const Int numStatuses = recvRequestStatuses_[rank].size(); + for (int i = 0; i < numStatuses; i++) + { + recvRequestStatuses_[rank][i] = !mpi::Test ( recvRequests_[rank][i] ); + if ( recvRequestStatuses_[rank][i] ) + return false; + } // if test is true, then it is safe to free buffer recvVectors_[rank][index].resize (0); @@ -472,6 +498,43 @@ bool AxpyInterface2::TestRecvs ( Matrix& Z ) return true; } +// progress replies +template +bool AxpyInterface2::TestReplies ( Matrix& Z ) +{ + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + + Int index; + typename std::deque::iterator dit; + + for (int rank = 0; rank < p; ++rank) + { + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + index = (dit - matrixBase_[rank].begin()); + + if ( index == matrixBase_[rank].size () ) + continue; + if ( 
replyRequestStatuses_[rank].size() == 0 ) + continue; + + // test all sends related to matrix + const Int numStatuses = replyRequestStatuses_[rank].size(); + for (int i = 0; i < numStatuses; i++) + { + replyRequestStatuses_[rank][i] = !mpi::Test ( replyRequests_[rank][i] ); + if ( replyRequestStatuses_[rank][i] ) + return false; + } + + // if test is true, then it is safe to free buffer + replyVectors_[rank][index].resize (0); + } + return true; +} + // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress @@ -512,8 +575,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) } case REQUEST_GET_TAG: { - const Int count = mpi::GetCount (status); - HandleGlobalToLocalData ( Z, i, j, count, status.MPI_SOURCE ); + HandleGlobalToLocalData ( Z, i, j ); break; } } @@ -525,7 +587,8 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) else { // all sends (data or request) are complete - if ( TestSends( Z ) && TestRecvs( Z ) && TestRequests( Z ) ) + if ( TestSends( Z ) && TestRecvs( Z ) + && TestRequests( Z ) && TestReplies ( Z ) ) { mpi::IBarrier ( g.VCComm(), nb_bar_request ); nb_bar_active = true; @@ -642,13 +705,12 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j, // handle request for data, post a matching issend template < typename T > -void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, - Int count, Int source ) +void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForGet_ ) LogicError("Local matrix cannot be updated"); @@ -657,58 +719,72 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j, const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - // data buffer - std::vector putVector_; + const Int myRow = g.Row(); + const Int myCol = g.Col(); + // dummy vector for 0 byte receive std::vector dummyVector_; + + mpi::Status status; - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); + if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) + { + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - const Int numEntries = (localHeight * localWidth); + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); - DEBUG_ONLY (if (numEntries < Int (sizeof (T))) - LogicError ("Count was too small");) + const Int iLocalOffset = Length 
(i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); - // data/dummy buffers resize - putVector_.resize ( numEntries ); - dummyVector_.resize ( 0 ); - T *dummyBuffer = dummyVector_.data (); - // post request for get - mpi::TaggedRecv (dummyBuffer, count, source, REQUEST_GET_TAG, g.VCComm ()); + const Int numEntries = localHeight * localWidth; - DEBUG_ONLY (if - (Int (putVector_.size ()) != count) LogicError ("Not enough space allocated");) + DEBUG_ONLY (if (numEntries < Int (sizeof (T))) + LogicError ("Count was too small");) - T *sendBuffer = putVector_.data (); + const Int source = status.MPI_SOURCE; - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &sendBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } + // dummy buffers resize + dummyVector_.resize ( 0 ); + T *dummyBuffer = dummyVector_.data (); + // post receive request for get + mpi::TaggedRecv (dummyBuffer, 0, source, + REQUEST_GET_TAG, g.VCComm ()); + + T* XBuffer = Z.Buffer(); + const Int index = + NextIndex (numEntries, replyVectors_[source], + replyRequests_[source], + replyRequestStatuses_[source], + matrixBase_[source], + XBuffer); - // Fire off non-blocking send - mpi::TaggedSend (sendBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + DEBUG_ONLY (if + (Int (replyVectors_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + T *replyBuffer = replyVectors_[source][index].data (); - putVector_.resize ( 0 ); - dummyVector_.resize ( 0 ); + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } + + // Fire off non-blocking send + mpi::TaggedISSend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), replyRequests_[source][index]); + + dummyVector_.resize ( 0 ); + } } // detach collectively @@ -738,14 +814,18 @@ void AxpyInterface2::Detach() sendVectors_.clear(); recvVectors_.clear(); + replyVectors_.clear(); + requestVectors_.clear(); sendRequests_.clear(); recvRequests_.clear(); requestRequests_.clear(); + replyRequests_.clear(); sendRequestStatuses_.clear(); recvRequestStatuses_.clear(); requestRequestStatuses_.clear(); + replyRequestStatuses_.clear(); matrixBase_.clear(); } From fa86f05faf96edacb38072009872b99674c0c89f Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 13 Aug 2014 19:49:29 -0500 Subject: [PATCH 069/110] packed/unpacked send/receive for local to global...still bugfixing --- include/El/core/AxpyInterface2.0.hpp | 3 +- include/El/core/imports/mpi.hpp | 15 ++++++-- src/core/AxpyInterface2.0.cpp | 31 ++++++++++------- src/core/imports/mpi.cpp | 52 ++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 18 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 8420b98f75..03e2bc648a 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -100,8 +100,7 @@ class AxpyInterface2 void HandleGlobalToLocalData( Matrix& Z, Int i, Int j ); void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, Int count, Int source ); - void HandleLocalToGlobalAcc( Matrix& Z, Int i, Int j, - Int count, Int source ); + void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); }; } // namespace El #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index c7b7d1f9d2..dfd58d58d7 
100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -148,6 +148,7 @@ const ErrorHandler ERRORS_RETURN = MPI_ERRORS_RETURN; const ErrorHandler ERRORS_ARE_FATAL = MPI_ERRORS_ARE_FATAL; const Group GROUP_EMPTY = MPI_GROUP_EMPTY; const Request REQUEST_NULL = MPI_REQUEST_NULL; + const Op MAX = MPI_MAX; const Op MIN = MPI_MIN; const Op MAXLOC = MPI_MAXLOC; @@ -228,15 +229,17 @@ void Translate // Utilities // --------- void SetWindowProp ( Window& window, int prop ); -void CheckBounds ( Window & window, Datatype win_type, Datatype type, +void CheckBounds ( Window & window, mpi::Datatype win_type, mpi::Datatype type, size_t count, ptrdiff_t target_offset ); void RmaProgress ( Comm comm ); +void PackCoordinates ( Int & i, Int & j, void *buffer, Int bufferSize, Comm comm ); +void UnpackCoordinates ( Int *i, Int *j, void *buffer, Int bufferSize, Comm comm ); // strided/vector to datatype void StridedDatatype (El_strided_t* stride_descr, - Datatype old_type, Datatype* new_type, + mpi::Datatype old_type, mpi::Datatype* new_type, size_t* source_dims); void VectorDatatype (El_iov_t * vect_descr, - Datatype old_type, Datatype * new_type, + mpi::Datatype old_type, mpi::Datatype * new_type, vector_pattern_t data_pattern); // Window creation/update/delete // ----------------------------- @@ -403,6 +406,12 @@ void TaggedISSend( T b, int to, int tag, Comm comm, Request& request ); // If the send count is one and the tag is irrelevant template void ISSend( T b, int to, Comm comm, Request& request ); +// Issend and Recv or MPI_PACKED +// ----------------------------- +void TaggedPackedISSend +( void* buf, Int bytes, int to, int tag, Comm comm, Request& request, Int i, Int j ); +void TaggedRecvUnpack +( void* buf, Int bytes, int from, int tag, Comm comm, Int *i, Int *j ); // Recv // ---- diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index ba33be712c..368b6d0068 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -9,8 +9,6 @@ which can be found in the LICENSE file in the root directory, or at // TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined // TODO bring back const interfaces // TODO localflush -// TODO error checks -// TODO fix progress function and call it in appropriate places namespace El { template @@ -354,7 +352,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if( numEntries != 0 ) + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); @@ -369,7 +367,6 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) (Int (sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) T *sendBuffer = sendVectors_[destination][index].data (); - for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) for( Int s=0; s::Flush( Matrix& Z, Int i, Int j ) case DATA_ACC_TAG: { const Int count = mpi::GetCount (status); - HandleLocalToGlobalAcc ( Z, i, j, count, status.MPI_SOURCE ); + HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); break; } case REQUEST_GET_TAG: @@ -654,8 +657,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, // replica of above function except this accumulates template < typename T > -void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j, - Int count, Int source ) +void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int source ) { DistMatrix &Y = 
*GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -677,9 +679,12 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int i, Int j, (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) T *getBuffer = getVector_.data (); + Int i, j; // post receive - mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); - + //mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); + mpi::TaggedRecvUnpack ( getBuffer, count * sizeof(T), + source, DATA_ACC_TAG, g.VCComm (), &i, &j ); + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign () + i) % r; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 92a0a84ce8..f1df3c5fe8 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -501,6 +501,25 @@ void RmaProgress ( Comm comm ) SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, comm.comm, &flag, MPI_STATUS_IGNORE)); } + +void PackCoordinates ( Int & i, Int & j, int *position, + void *buffer, Int bufferSize, Comm comm ) +{ + SafeMpi ( MPI_Pack (&i, 1, MPI_INT, buffer, + bufferSize, position, comm.comm) ); + SafeMpi ( MPI_Pack (&j, 1, MPI_INT, buffer, + bufferSize, position, comm.comm) ); +} + +void UnpackCoordinates ( Int *i, Int *j, int *position, + void *buffer, Int bufferSize, Comm comm ) +{ + SafeMpi ( MPI_Unpack (buffer, bufferSize, position, + i, 1, MPI_INT, comm.comm) ); + SafeMpi ( MPI_Unpack (buffer, bufferSize, position, + j, 1, MPI_INT, comm.comm) ); +} + // TODO these functions for DDT creation are // completely untested #ifdef EL_USE_DERIVED_DATATYPE @@ -1969,6 +1988,39 @@ template void TaggedISSend (Complex < double >b, int to, int tag, Comm comm, Request & request); +// Issend and Recv for MPI_PACKED +// ------------------------------ +void TaggedPackedISSend +( void* buf, Int bytes, int to, int tag, Comm comm, Request& request, int i, int j ) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) + int position = 0; + // pack coordinates + PackCoordinates ( i, j, &position, buf, bytes, comm ); + // Fire off nonblocking sends + SafeMpi + (MPI_Issend + ( buf, bytes, + MPI_PACKED, to, tag, comm.comm, + &request)); +} + +void TaggedRecvUnpack +( void* buf, Int bytes, int from, int tag, Comm comm, int *i, int *j ) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) + Status status; + // receive + SafeMpi (MPI_Recv + ( buf, bytes, MPI_PACKED, from, tag, + comm.comm, &status)); + // unpack coordinates + int position = 0; + UnpackCoordinates ( i, j, &position, buf, bytes, comm ); +} + +// Recv +// ---- template < typename R > void TaggedRecv (R * buf, int count, int from, int tag, Comm comm) From c58be270c5a8c5e4a412d4f8c36c2124d2664132 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 14 Aug 2014 13:46:46 -0500 Subject: [PATCH 070/110] remove all packed send/recv stuff, falling back to send/recving coordinates as the base version for axpy2...and yeah, this works --- include/El/core/AxpyInterface2.0.hpp | 49 +++++-- include/El/core/imports/mpi.hpp | 9 +- src/core/AxpyInterface2.0.cpp | 208 ++++++++++++++++++++++----- src/core/imports/mpi.cpp | 56 +------- 4 files changed, 210 insertions(+), 112 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 03e2bc648a..eb5cf25174 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -45,23 +45,30 @@ class AxpyInterface2 DATA_PUT_TAG =1, DATA_GET_TAG =2, DATA_ACC_TAG =3, - REQUEST_GET_TAG =4; + REQUEST_GET_TAG =4, + 
IJ_TAG =5; /* Request objects for send, recv and request op */ std::vector> sendRequests_, requestRequests_, - recvRequests_, replyRequests_; + recvRequests_, replyRequests_, + sendIJRequests_; /* Request statuses for send, recv and request op */ std::vector> sendRequestStatuses_, recvRequestStatuses_, requestRequestStatuses_, - replyRequestStatuses_; + replyRequestStatuses_, + sendIJRequestStatuses_; /* Stores matrix base addresses */ std::vector> matrixBase_; + + /* Stores i, j coordinates */ + std::vector>> + coordVectors_; /* Receive and Send vectors */ std::vector>> @@ -90,16 +97,38 @@ class AxpyInterface2 std::deque &matrixBase, const T * base_address ); + Int NextIndex + ( Int i, Int j, Int dataSize, + std::deque > &coordVectors, + std::deque > &dataVectors, + std::deque &requestData, + std::deque &requestDataStatus, + std::deque &requestCoord, + std::deque &requestCoordStatus, + std::deque &matrixBase, + T * base_address); + + Int NextIndex + ( Int i, Int j, Int dataSize, + std::deque > &coordVectors, + std::deque > &dataVectors, + std::deque &requestData, + std::deque &requestDataStatus, + std::deque &requestCoord, + std::deque &requestCoordStatus, + std::deque &matrixBase, + const T * base_address); + /* Test */ // probably we need const interfaces also? - bool TestRequests ( Matrix& Z ); - bool TestReplies ( Matrix& Z ); - bool TestSends ( Matrix& Z ); - bool TestRecvs ( Matrix& Z ); + bool TestRequests ( Matrix& Z ); + bool TestReplies ( Matrix& Z ); + bool TestSends ( Matrix& Z ); + bool TestRecvs ( Matrix& Z ); + bool TestSendsCoord ( Matrix& Z ); - void HandleGlobalToLocalData( Matrix& Z, Int i, Int j ); - void HandleLocalToGlobalData( Matrix& Z, Int i, Int j, - Int count, Int source ); + void HandleGlobalToLocalData( Matrix& Z ); + void HandleLocalToGlobalData( Matrix& Z, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); }; } // namespace El diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index dfd58d58d7..690833e27d 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -232,8 +232,7 @@ void SetWindowProp ( Window& window, int prop ); void CheckBounds ( Window & window, mpi::Datatype win_type, mpi::Datatype type, size_t count, ptrdiff_t target_offset ); void RmaProgress ( Comm comm ); -void PackCoordinates ( Int & i, Int & j, void *buffer, Int bufferSize, Comm comm ); -void UnpackCoordinates ( Int *i, Int *j, void *buffer, Int bufferSize, Comm comm ); + // strided/vector to datatype void StridedDatatype (El_strided_t* stride_descr, mpi::Datatype old_type, mpi::Datatype* new_type, @@ -406,12 +405,6 @@ void TaggedISSend( T b, int to, int tag, Comm comm, Request& request ); // If the send count is one and the tag is irrelevant template void ISSend( T b, int to, Comm comm, Request& request ); -// Issend and Recv or MPI_PACKED -// ----------------------------- -void TaggedPackedISSend -( void* buf, Int bytes, int to, int tag, Comm comm, Request& request, Int i, Int j ); -void TaggedRecvUnpack -( void* buf, Int bytes, int from, int tag, Comm comm, Int *i, Int *j ); // Recv // ---- diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 368b6d0068..b031d08159 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -14,13 +14,13 @@ namespace El template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - sendVectors_(0), recvVectors_(0), replyVectors_(0), + sendVectors_(0), recvVectors_(0), 
replyVectors_(0), coordVectors_(0), requestVectors_(0), sendRequests_(0), recvRequests_(0), - replyRequests_(0), requestRequests_(0), matrixBase_(0), - sendRequestStatuses_(0), requestRequestStatuses_(0), + replyRequests_(0), requestRequests_(0), sendIJRequests_(0), + matrixBase_(0), sendRequestStatuses_(0), requestRequestStatuses_(0), replyRequestStatuses_(0), recvRequestStatuses_(0), - toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(false) + sendIJRequestStatuses_(0), toBeAttachedForPut_(false), + toBeAttachedForGet_(false), attached_(false), detached_(false) { } template @@ -41,16 +41,19 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) sendVectors_.resize( p ); recvVectors_.resize( p ); replyVectors_.resize( p ); + coordVectors_.resize( p ); sendRequests_.resize (p); recvRequests_.resize (p); replyRequests_.resize (p); requestRequests_.resize (p); + sendIJRequests_.resize (p); sendRequestStatuses_.resize (p); recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); replyRequestStatuses_.resize (p); + sendIJRequestStatuses_.resize (p); matrixBase_.resize (p); } @@ -105,16 +108,19 @@ void AxpyInterface2::Attach( DistMatrix& Z ) sendVectors_.resize( p ); replyVectors_.resize( p ); requestVectors_.resize( p ); + coordVectors_.resize( p ); sendRequests_.resize (p); recvRequests_.resize (p); requestRequests_.resize (p); replyRequests_.resize (p); + sendIJRequests_.resize (p); sendRequestStatuses_.resize (p); recvRequestStatuses_.resize (p); requestRequestStatuses_.resize (p); replyRequestStatuses_.resize (p); + sendIJRequestStatuses_.resize (p); matrixBase_.resize (p); } @@ -143,6 +149,41 @@ Int AxpyInterface2::NextIndex return Index; } +template +Int AxpyInterface2::NextIndex +( Int i, Int j, Int dataSize, + std::deque > &coordVectors, + std::deque > &dataVectors, + std::deque &requestData, + std::deque &requestDataStatus, + std::deque &requestCoord, + std::deque &requestCoordStatus, + std::deque &matrixBase, + T * base_address) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) + const Int Index = Int(requestData.size ()); + + dataVectors.resize (Index + 1); + dataVectors[Index].resize (dataSize); + + coordVectors.resize (Index + 1); + coordVectors[Index].resize (2); + coordVectors[Index][0] = i; + coordVectors[Index][1] = j; + + requestData.push_back (mpi::REQUEST_NULL); + requestDataStatus.push_back ( true ); + + requestCoord.push_back (mpi::REQUEST_NULL); + requestCoordStatus.push_back ( true ); + + // stores Matrix base address by index + matrixBase.push_back (base_address); + + return Index; +} + template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { @@ -191,9 +232,13 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (numEntries, sendVectors_[destination], + NextIndex (i, j, numEntries, + coordVectors_[destination], + sendVectors_[destination], sendRequests_[destination], sendRequestStatuses_[destination], + sendIJRequests_[destination], + sendIJRequestStatuses_[destination], matrixBase_[destination], XBuffer); @@ -213,7 +258,11 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), sendRequests_[destination][index]); - } + // send coordinates + Int *coord = coordVectors_[destination][index].data (); + mpi::TaggedISSend (coord, 2, destination, IJ_TAG, g.VCComm (), + sendIJRequests_[destination][index]); + } 
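The two-message scheme above replaces the earlier MPI_PACKED approach: the payload travels under DATA_PUT_TAG via a synchronous-mode send, and the (i, j) coordinates travel separately under IJ_TAG (later renamed COORD_IJ_TAG, with the coordinate send relaxed to MPI_Isend in a subsequent patch of this series). The following minimal sketch exercises the pattern outside Elemental; the tag values, the 16-element payload, and the (2, 7) offsets are stand-ins, and error handling is omitted.

    #include <mpi.h>
    #include <vector>

    int main( int argc, char* argv[] )
    {
        MPI_Init( &argc, &argv );
        int rank;
        MPI_Comm_rank( MPI_COMM_WORLD, &rank );
        const int DATA_TAG = 1, COORD_TAG = 5; // stand-ins for DATA_PUT_TAG, COORD_IJ_TAG
        const int N = 16;                      // stand-in payload size

        if( rank == 0 )
        {
            std::vector<double> payload( N, 3.14 );
            int coord[2] = { 2, 7 };           // (i, j) offsets, kept alive until completion
            MPI_Request reqs[2];
            // Synchronous-mode send for the payload: completion implies the
            // matching receive was posted, which the consensus logic
            // elsewhere in this series relies on.
            MPI_Issend( payload.data(), N, MPI_DOUBLE, 1, DATA_TAG,
                        MPI_COMM_WORLD, &reqs[0] );
            // Plain nonblocking send for the two coordinates.
            MPI_Isend( coord, 2, MPI_INT, 1, COORD_TAG,
                       MPI_COMM_WORLD, &reqs[1] );
            MPI_Waitall( 2, reqs, MPI_STATUSES_IGNORE );
        }
        else if( rank == 1 )
        {
            // Matching is by (source, tag, comm), so the coordinates can
            // be received first even though the payload was sent first.
            int coord[2];
            MPI_Recv( coord, 2, MPI_INT, 0, COORD_TAG,
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE );
            std::vector<double> payload( N );
            MPI_Recv( payload.data(), N, MPI_DOUBLE, 0, DATA_TAG,
                      MPI_COMM_WORLD, MPI_STATUS_IGNORE );
            // coord[0], coord[1] now give the (i, j) offsets for payload.
        }
        MPI_Finalize();
        return 0;
    }

Because MPI messages between a pair of processes are non-overtaking per tag and communicator, the receiver is free to take the coordinates before the payload, which is exactly what the reworked handlers below do.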
receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; @@ -251,9 +300,13 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { const Int bufferSize = 0; const Int index = - NextIndex (bufferSize, requestVectors_[rank], + NextIndex (i, j, bufferSize, + coordVectors_[rank], + requestVectors_[rank], requestRequests_[rank], requestRequestStatuses_[rank], + sendIJRequests_[rank], + sendIJRequestStatuses_[rank], matrixBase_[rank], XBuffer); @@ -261,15 +314,22 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) T *requestBuffer = requestVectors_[rank][index].data (); // Fire off non-blocking send - mpi::TaggedISSend (requestBuffer, bufferSize, rank, REQUEST_GET_TAG, g.VCComm (), + mpi::TaggedISSend (requestBuffer, bufferSize, rank, + REQUEST_GET_TAG, g.VCComm (), requestRequests_[rank][index]); + + // send coordinates + Int *coord = coordVectors_[rank][index].data (); + mpi::TaggedISSend (coord, 2, rank, + IJ_TAG, g.VCComm (), + sendIJRequests_[rank][index]); } // Receive all of the replies Int numReplies = 0; while (numReplies < p) { mpi::Status status; - HandleGlobalToLocalData ( Z, i, j ); + HandleGlobalToLocalData ( Z ); if (mpi::IProbe (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { @@ -357,33 +417,39 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (numEntries, sendVectors_[destination], + NextIndex (i, j, numEntries, + coordVectors_[destination], + sendVectors_[destination], sendRequests_[destination], sendRequestStatuses_[destination], + sendIJRequests_[destination], + sendIJRequestStatuses_[destination], matrixBase_[destination], XBuffer); DEBUG_ONLY (if (Int (sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) + T *sendBuffer = sendVectors_[destination][index].data (); - for( Int t=0; t::TestReplies ( Matrix& Z ) return true; } +// progress coordinate sends +template +bool AxpyInterface2::TestSendsCoord ( Matrix& Z ) +{ + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + + Int index; + typename std::deque::iterator dit; + + for (int rank = 0; rank < p; ++rank) + { + dit = std::find ( matrixBase_[rank].begin(), + matrixBase_[rank].end(), Z.LockedBuffer ()); + index = (dit - matrixBase_[rank].begin()); + + if ( index == matrixBase_[rank].size () ) + continue; + if ( sendIJRequestStatuses_[rank].size() == 0 ) + continue; + + // test all sends related to matrix + const Int numStatuses = sendIJRequestStatuses_[rank].size(); + for (int i = 0; i < numStatuses; i++) + { + sendIJRequestStatuses_[rank][i] = !mpi::Test ( sendIJRequests_[rank][i] ); + if ( sendIJRequestStatuses_[rank][i] ) + return false; + } + // if test is true, then it is safe to free buffer + coordVectors_[rank][index].resize (0); + } + return true; +} // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress @@ -567,7 +668,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) case DATA_PUT_TAG: { const Int count = mpi::GetCount (status); - HandleLocalToGlobalData ( Z, i, j, count, status.MPI_SOURCE ); + HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); break; } case DATA_ACC_TAG: @@ -578,7 +679,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) } case REQUEST_GET_TAG: { - HandleGlobalToLocalData ( Z, i, j ); + HandleGlobalToLocalData ( Z ); break; } } @@ -589,9 +690,13 
@@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) } else { - // all sends (data or request) are complete - if ( TestSends( Z ) && TestRecvs( Z ) - && TestRequests( Z ) && TestReplies ( Z ) ) + // check if all sends (data or request) are + // complete for a particular matrix + if ( TestSends( Z ) + && TestRecvs( Z ) + && TestRequests( Z ) + && TestReplies ( Z ) + && TestSendsCoord ( Z ) ) { mpi::IBarrier ( g.VCComm(), nb_bar_request ); nb_bar_active = true; @@ -600,6 +705,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) } } +// all communications pertaining to matrix Z template void AxpyInterface2::Flush( Matrix& Z ) { @@ -608,8 +714,7 @@ void AxpyInterface2::Flush( Matrix& Z ) } template < typename T > -void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, - Int count, Int source ) +void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int source ) { DistMatrix &Y = *GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -620,18 +725,26 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, int height = Z.Height(); int width = Z.Width(); std::vector getVector_; + std::vector getCoord_; DEBUG_ONLY (if (count < Int (sizeof (T))) LogicError ("Count was too small");) getVector_.resize (count); + getCoord_.resize (2); + DEBUG_ONLY (if (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) T *getBuffer = getVector_.data (); // post receive mpi::TaggedRecv (getBuffer, count, source, DATA_PUT_TAG, g.VCComm ()); - + + // post receive for coordinates + Int *coord = getCoord_.data(); + mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); + Int i = coord[0]; Int j = coord[1]; + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign () + i) % r; @@ -653,6 +766,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int i, Int j, } // Free the memory getVector_.resize ( 0 ); + getCoord_.resize ( 0 ); } // replica of above function except this accumulates @@ -668,23 +782,29 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so const int height = Z.Height(); const int width = Z.Width(); // data buffer - std::vector getVector_; + std::vector< T > getVector_; + std::vector getCoord_; DEBUG_ONLY (if (count < Int (sizeof (T))) LogicError ("Count was too small");) // data buffer resize getVector_.resize (count); + getCoord_.resize (2); DEBUG_ONLY (if (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) + + // post receive for data T *getBuffer = getVector_.data (); - Int i, j; - // post receive - //mpi::TaggedRecv (getBuffer, count, source, DATA_ACC_TAG, g.VCComm ()); - mpi::TaggedRecvUnpack ( getBuffer, count * sizeof(T), - source, DATA_ACC_TAG, g.VCComm (), &i, &j ); - + mpi::TaggedRecv (getBuffer, count, source, + DATA_ACC_TAG, g.VCComm ()); + + // post receive for coordinates + Int *coord = getCoord_.data(); + mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); + Int i = coord[0]; Int j = coord[1]; + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign () + i) % r; @@ -706,16 +826,15 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so } // Free the memory getVector_.resize ( 0 ); + getCoord_.resize ( 0 ); } // handle request for data, post a matching issend template < typename T > -void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) +void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) { DEBUG_ONLY 
(CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForGet_ ) LogicError("Local matrix cannot be updated"); @@ -729,11 +848,20 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) // dummy vector for 0 byte receive std::vector dummyVector_; + std::vector getCoord_; mpi::Status status; if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { + const Int source = status.MPI_SOURCE; + getCoord_.resize (2); + + // post receive for coordinates + Int *coord = getCoord_.data(); + mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); + Int i = coord[0]; Int j = coord[1]; + const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; @@ -755,8 +883,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - const Int source = status.MPI_SOURCE; - // dummy buffers resize dummyVector_.resize ( 0 ); T *dummyBuffer = dummyVector_.data (); @@ -789,6 +915,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z, Int i, Int j ) DATA_GET_TAG, g.VCComm (), replyRequests_[source][index]); dummyVector_.resize ( 0 ); + getCoord_.resize ( 0 ); } } @@ -818,16 +945,19 @@ void AxpyInterface2::Detach() GlobalArrayGet_ = 0; sendVectors_.clear(); + coordVectors_.clear(); recvVectors_.clear(); replyVectors_.clear(); requestVectors_.clear(); + sendIJRequests_.clear(); sendRequests_.clear(); recvRequests_.clear(); requestRequests_.clear(); replyRequests_.clear(); sendRequestStatuses_.clear(); + sendIJRequestStatuses_.clear(); recvRequestStatuses_.clear(); requestRequestStatuses_.clear(); replyRequestStatuses_.clear(); diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index f1df3c5fe8..5a4b98c0bf 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -501,28 +501,8 @@ void RmaProgress ( Comm comm ) SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, comm.comm, &flag, MPI_STATUS_IGNORE)); } - -void PackCoordinates ( Int & i, Int & j, int *position, - void *buffer, Int bufferSize, Comm comm ) -{ - SafeMpi ( MPI_Pack (&i, 1, MPI_INT, buffer, - bufferSize, position, comm.comm) ); - SafeMpi ( MPI_Pack (&j, 1, MPI_INT, buffer, - bufferSize, position, comm.comm) ); -} - -void UnpackCoordinates ( Int *i, Int *j, int *position, - void *buffer, Int bufferSize, Comm comm ) -{ - SafeMpi ( MPI_Unpack (buffer, bufferSize, position, - i, 1, MPI_INT, comm.comm) ); - SafeMpi ( MPI_Unpack (buffer, bufferSize, position, - j, 1, MPI_INT, comm.comm) ); -} - // TODO these functions for DDT creation are // completely untested -#ifdef EL_USE_DERIVED_DATATYPE void StridedDatatype (El_strided_t* stride_descr, Datatype old_type, Datatype* new_type, size_t* source_dims) @@ -567,7 +547,7 @@ void StridedDatatype (El_strided_t* stride_descr, reinterpret_cast(sizes), reinterpret_cast(stride_descr->offsets), MPI_ORDER_C, old_type, new_type) ); - + delete[] dims; delete[] sizes; } @@ -649,7 +629,6 @@ void VectorDatatype (El_iov_t * vect_descr, (const int *) vect_descr->sizes, vect_descr->offsets, old_type, new_type) ); } -#endif void WindowFree (Window & window) { @@ -1988,39 +1967,6 @@ template void TaggedISSend (Complex < double >b, int to, int tag, Comm comm, Request & request); -// Issend and Recv for MPI_PACKED -// ------------------------------ -void TaggedPackedISSend -( void* buf, Int bytes, int to, int tag, Comm comm, Request& 
request, int i, int j ) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) - int position = 0; - // pack coordinates - PackCoordinates ( i, j, &position, buf, bytes, comm ); - // Fire off nonblocking sends - SafeMpi - (MPI_Issend - ( buf, bytes, - MPI_PACKED, to, tag, comm.comm, - &request)); -} - -void TaggedRecvUnpack -( void* buf, Int bytes, int from, int tag, Comm comm, int *i, int *j ) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) - Status status; - // receive - SafeMpi (MPI_Recv - ( buf, bytes, MPI_PACKED, from, tag, - comm.comm, &status)); - // unpack coordinates - int position = 0; - UnpackCoordinates ( i, j, &position, buf, bytes, comm ); -} - -// Recv -// ---- template < typename R > void TaggedRecv (R * buf, int count, int from, int tag, Comm comm) From 3b055f3dfe307531b24ad03a4d2d90fdc071490b Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 14 Aug 2014 16:19:21 -0500 Subject: [PATCH 071/110] updated the nbc version of axpy original, this is better than before --- include/El/core/AxpyInterface.hpp | 12 ++- src/core/AxpyInterface.cpp | 148 +++++++++++++++++++++++------- 2 files changed, 123 insertions(+), 37 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index ce8c30ab68..c0a8d3d5a6 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -48,16 +48,16 @@ class AxpyInterface DATA_REPLY_TAG =4; //request object for polling on Issends -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - byte all_sends_are_finished; -#endif bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; DistMatrix* localToGlobalMat_; const DistMatrix* globalToLocalMat_; +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else std::vector sentEomTo_, haveEomFrom_; std::vector eomSendRequests_; +#endif std::vector> sendingData_, sendingRequest_, sendingReply_; @@ -74,10 +74,13 @@ class AxpyInterface bool Finished(); // Progress functions void UpdateRequestStatuses(); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + bool ReturnRequestStatuses(); +#else void HandleEoms(); void StartSendingEoms(); void FinishSendingEoms(); - +#endif Int ReadyForSend ( Int sendSize, std::deque>& sendVectors, @@ -92,5 +95,4 @@ class AxpyInterface }; } // namespace El - #endif // ifndef EL_AXPYINTERFACE_HPP diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 04785bbbcc..6eb07b99d0 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -15,6 +15,8 @@ namespace El { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else template < typename T > bool AxpyInterface < T >::Finished () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); @@ -97,7 +99,8 @@ namespace El haveEomFrom_[source] = true; } } - +#endif + template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) @@ -283,15 +286,21 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) const Int p = Z.Grid().Size(); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); +#endif sendingData_.resize( p ); sendingRequest_.resize( p ); sendingReply_.resize( p ); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else eomSendRequests_.resize( p ); - +#endif + dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( p ); @@ -320,15 +329,21 @@ 
AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) const Int p = X.Grid ().Size (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); +#endif sendingData_.resize (p); sendingRequest_.resize (p); sendingReply_.resize (p); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else eomSendRequests_.resize (p); - +#endif + dataSendRequests_.resize (p); requestSendRequests_.resize (p); replySendRequests_.resize (p); @@ -384,21 +399,24 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &Z; } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif const Int p = Z.Grid ().Size (); - + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); +#endif // request objects sendingRequest_.resize (p); sendingData_.resize (p); sendingReply_.resize (p); - + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else eomSendRequests_.resize (p); - +#endif + // ready-to-send requestSendRequests_.resize (p); replySendRequests_.resize (p); @@ -426,21 +444,25 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } -#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif const Int p = X.Grid ().Size (); +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else // eom sentEomTo_.resize (p, false); haveEomFrom_.resize (p, false); +#endif + // ready-to-send sendingRequest_.resize (p); sendingData_.resize (p); sendingReply_.resize (p); - + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else eomSendRequests_.resize (p); - +#endif + // ready-to-send requestSendRequests_.resize (p); replySendRequests_.resize (p); @@ -711,8 +733,48 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); } } - - template < typename T > void AxpyInterface < T >::Detach () + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +template < typename T > bool AxpyInterface < T >::ReturnRequestStatuses () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) + const Grid & g = (attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + const Int numDataSendRequests = dataSendRequests_[i].size (); + for (Int j = 0; j < numDataSendRequests; ++j) + { + if (sendingData_[i][j]) + sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); + if (sendingData_[i][j]) + return false; + } + const Int numRequestSendRequests = requestSendRequests_[i].size (); + for (Int j = 0; j < numRequestSendRequests; ++j) + { + if (sendingRequest_[i][j]) + sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); + if (sendingRequest_[i][j]) + return false; + } + const Int numReplySendRequests = replySendRequests_[i].size (); + for (Int j = 0; j < numReplySendRequests; ++j) + { + if (sendingReply_[i][j]) + sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); + if (sendingReply_[i][j]) + return false; + } + } + return true; + } +#endif + +template < typename T > void AxpyInterface < T >::Detach () { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) @@ -729,17 +791,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; - // progress my issends - for (int i = 0; i < dataSendRequests_[me].size(); i++) - { - if ( !mpi::Test ( dataSendRequests_[me][i] )) - mpi::Wait ( dataSendRequests_[me][i] ); - sendingData_[me][i] = false; - } - // nonblocking ssends must have been issued - all_sends_are_finished = '1'; - // spin till all messages sent have been - // received + while (!DONE) { // probes for incoming message and @@ -753,7 +805,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } else { - if (all_sends_are_finished == '1') + if ( ReturnRequestStatuses() ) { // all ssends are complete, start nonblocking barrier mpi::IBarrier (g.VCComm (), nb_bar_request); @@ -772,22 +824,51 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } else { +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while ( !DONE ) + { + // probes for incoming request message + // and sends + HandleGlobalToLocalRequest (); + + if (nb_bar_active) + { + // test/wait for IBarrier completion + DONE = mpi::Test (nb_bar_request); + } + else + { + if ( ReturnRequestStatuses() ) + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier (g.VCComm (), nb_bar_request); + nb_bar_active = true; + } + } + } +#else while (!Finished ()) { HandleGlobalToLocalRequest (); HandleEoms (); } mpi::Barrier (g.VCComm ()); +#endif } attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; recvVector_.clear(); + #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) - all_sends_are_finished = '0'; -#endif +#else sentEomTo_.clear(); haveEomFrom_.clear(); +#endif sendingData_.clear(); sendingRequest_.clear(); @@ -800,9 +881,12 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) dataSendRequests_.clear(); requestSendRequests_.clear(); replySendRequests_.clear(); - + +#if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) +#else eomSendRequests_.clear(); - } +#endif + } template class AxpyInterface < Int >; template class AxpyInterface < float >; From 02d3b14ffa9c975ef117df75e5f3479fc71cb8cc Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 16 Aug 2014 00:42:06 -0500 Subject: [PATCH 072/110] modify the ssends to isend for 
small messages --- include/El/core/AxpyInterface2.0.hpp | 20 +-- src/core/AxpyInterface2.0.cpp | 242 ++++++++++++--------------- 2 files changed, 111 insertions(+), 151 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index eb5cf25174..3731d9a101 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -46,7 +46,7 @@ class AxpyInterface2 DATA_GET_TAG =2, DATA_ACC_TAG =3, REQUEST_GET_TAG =4, - IJ_TAG =5; + COORD_IJ_TAG =5; /* Request objects for send, recv and request op */ std::vector> @@ -82,41 +82,31 @@ class AxpyInterface2 bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - Int NextIndex ( Int dataSize, + Int NextIndex ( Int rank, Int dataSize, std::deque > &dataVectors, std::deque &requests, std::deque &requestStatus, - std::deque &matrixBase, T * base_address ); // note: this is just a placeholder // would be replaced soon - Int NextIndex ( Int dataSize, + Int NextIndex ( Int rank, Int dataSize, std::deque > &dataVectors, std::deque &requests, std::deque &requestStatus, - std::deque &matrixBase, const T * base_address ); Int NextIndex - ( Int i, Int j, Int dataSize, - std::deque > &coordVectors, + ( Int rank, Int i, Int j, Int dataSize, std::deque > &dataVectors, std::deque &requestData, std::deque &requestDataStatus, - std::deque &requestCoord, - std::deque &requestCoordStatus, - std::deque &matrixBase, T * base_address); Int NextIndex - ( Int i, Int j, Int dataSize, - std::deque > &coordVectors, + ( Int rank, Int i, Int j, Int dataSize, std::deque > &dataVectors, std::deque &requestData, std::deque &requestDataStatus, - std::deque &requestCoord, - std::deque &requestCoordStatus, - std::deque &matrixBase, const T * base_address); /* Test */ diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index b031d08159..a436a11e20 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -128,10 +128,10 @@ void AxpyInterface2::Attach( DistMatrix& Z ) template Int AxpyInterface2::NextIndex -( Int dataSize, std::deque > &dataVectors, +( Int rank, Int dataSize, + std::deque > &dataVectors, std::deque &requests, std::deque &requestStatus, - std::deque &matrixBase, T * base_address ) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) @@ -144,21 +144,17 @@ Int AxpyInterface2::NextIndex requestStatus.push_back ( true ); // stores Matrix base address by index - matrixBase.push_back (base_address); + matrixBase_[rank].push_back (base_address); return Index; } template Int AxpyInterface2::NextIndex -( Int i, Int j, Int dataSize, - std::deque > &coordVectors, +( Int rank, Int i, Int j, Int dataSize, std::deque > &dataVectors, std::deque &requestData, std::deque &requestDataStatus, - std::deque &requestCoord, - std::deque &requestCoordStatus, - std::deque &matrixBase, T * base_address) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) @@ -167,19 +163,19 @@ Int AxpyInterface2::NextIndex dataVectors.resize (Index + 1); dataVectors[Index].resize (dataSize); - coordVectors.resize (Index + 1); - coordVectors[Index].resize (2); - coordVectors[Index][0] = i; - coordVectors[Index][1] = j; + coordVectors_[rank].resize (Index + 1); + coordVectors_[rank][Index].resize (2); + coordVectors_[rank][Index][0] = i; + coordVectors_[rank][Index][1] = j; requestData.push_back (mpi::REQUEST_NULL); requestDataStatus.push_back ( true ); - requestCoord.push_back (mpi::REQUEST_NULL); - requestCoordStatus.push_back ( true ); + 
sendIJRequests_[rank].push_back (mpi::REQUEST_NULL); + sendIJRequestStatuses_[rank].push_back ( true ); // stores Matrix base address by index - matrixBase.push_back (base_address); + matrixBase_[rank].push_back (base_address); return Index; } @@ -232,21 +228,17 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (i, j, numEntries, - coordVectors_[destination], + NextIndex (destination, i, j, numEntries, sendVectors_[destination], sendRequests_[destination], sendRequestStatuses_[destination], - sendIJRequests_[destination], - sendIJRequestStatuses_[destination], - matrixBase_[destination], XBuffer); DEBUG_ONLY (if (Int (sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = sendVectors_[destination][index].data (); + T *sendBuffer = sendVectors_[destination][index].data (); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), sendRequests_[destination][index]); + // send coordinates Int *coord = coordVectors_[destination][index].data (); - mpi::TaggedISSend (coord, 2, destination, IJ_TAG, g.VCComm (), + mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), sendIJRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; @@ -295,35 +288,28 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) T* XBuffer = Z.Buffer(); // Send out the requests to all processes in the grid - // 0 byte send for (Int rank = 0; rank < p; ++rank) { - const Int bufferSize = 0; + // we just use the request objects for progress const Int index = - NextIndex (i, j, bufferSize, - coordVectors_[rank], + NextIndex (rank, i, j, 1, requestVectors_[rank], requestRequests_[rank], requestRequestStatuses_[rank], - sendIJRequests_[rank], - sendIJRequestStatuses_[rank], - matrixBase_[rank], XBuffer); - - // Copy the request header into the send buffer - T *requestBuffer = requestVectors_[rank][index].data (); - - // Fire off non-blocking send - mpi::TaggedISSend (requestBuffer, bufferSize, rank, - REQUEST_GET_TAG, g.VCComm (), + // send request + T *requestBuffer = requestVectors_[rank][index].data(); + mpi::TaggedISSend (requestBuffer, 1, rank, + REQUEST_GET_TAG, g.VCComm(), requestRequests_[rank][index]); - + // send coordinates Int *coord = coordVectors_[rank][index].data (); - mpi::TaggedISSend (coord, 2, rank, - IJ_TAG, g.VCComm (), + mpi::TaggedISend (coord, 2, rank, + COORD_IJ_TAG, g.VCComm (), sendIJRequests_[rank][index]); } + // Receive all of the replies Int numReplies = 0; while (numReplies < p) @@ -362,7 +348,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) YCol[colShift + s * r] = XCol[s]; } ++numReplies; - recvVector_.resize ( 0 ); + recvVector_.clear(); } } } @@ -401,7 +387,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); + const Int YLDim = Y.LDim(); for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int index = - NextIndex (i, j, numEntries, - coordVectors_[destination], + NextIndex (destination, i, j, numEntries, sendVectors_[destination], sendRequests_[destination], sendRequestStatuses_[destination], - sendIJRequests_[destination], - sendIJRequestStatuses_[destination], - matrixBase_[destination], XBuffer); DEBUG_ONLY (if (Int 
(sendVectors_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - + T *sendBuffer = sendVectors_[destination][index].data (); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) // send data mpi::TaggedISSend (sendBuffer, numEntries, destination, - DATA_ACC_TAG, g.VCComm (), + DATA_ACC_TAG, g.VCComm(), sendRequests_[destination][index]); - + // send coordinates - Int *coord = coordVectors_[destination][index].data (); - mpi::TaggedISSend (coord, 2, destination, - IJ_TAG, g.VCComm (), + Int *coord = coordVectors_[destination][index].data(); + mpi::TaggedISend (coord, 2, destination, + COORD_IJ_TAG, g.VCComm(), sendIJRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; @@ -472,10 +454,10 @@ bool AxpyInterface2::TestRequests ( Matrix& Z ) for (int rank = 0; rank < p; ++rank) { dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); + matrixBase_[rank].end(), Z.LockedBuffer()); index = (dit - matrixBase_[rank].begin()); - if ( index == matrixBase_[rank].size () ) + if ( index == matrixBase_[rank].size() ) continue; if ( requestRequestStatuses_[rank].size() == 0 ) continue; @@ -488,7 +470,7 @@ bool AxpyInterface2::TestRequests ( Matrix& Z ) return false; } // okay to deallocate - requestVectors_[rank][index].resize( 0 ); + requestVectors_[rank][index].clear(); } return true; } @@ -507,10 +489,10 @@ bool AxpyInterface2::TestSends ( Matrix& Z ) for (int rank = 0; rank < p; ++rank) { dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); + matrixBase_[rank].end(), Z.LockedBuffer()); index = (dit - matrixBase_[rank].begin()); - if ( index == matrixBase_[rank].size () ) + if ( index == matrixBase_[rank].size() ) continue; if ( sendRequestStatuses_[rank].size() == 0 ) continue; @@ -525,7 +507,7 @@ bool AxpyInterface2::TestSends ( Matrix& Z ) } // if test is true, then it is safe to free buffer - sendVectors_[rank][index].resize (0); + sendVectors_[rank][index].clear(); } return true; } @@ -544,10 +526,10 @@ bool AxpyInterface2::TestRecvs ( Matrix& Z ) for (int rank = 0; rank < p; ++rank) { dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); + matrixBase_[rank].end(), Z.LockedBuffer()); index = (dit - matrixBase_[rank].begin()); - if ( index == matrixBase_[rank].size () ) + if ( index == matrixBase_[rank].size() ) continue; if ( recvRequestStatuses_[rank].size() == 0 ) continue; @@ -562,7 +544,7 @@ bool AxpyInterface2::TestRecvs ( Matrix& Z ) } // if test is true, then it is safe to free buffer - recvVectors_[rank][index].resize (0); + recvVectors_[rank][index].clear(); } return true; } @@ -581,10 +563,10 @@ bool AxpyInterface2::TestReplies ( Matrix& Z ) for (int rank = 0; rank < p; ++rank) { dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); + matrixBase_[rank].end(), Z.LockedBuffer()); index = (dit - matrixBase_[rank].begin()); - if ( index == matrixBase_[rank].size () ) + if ( index == matrixBase_[rank].size() ) continue; if ( replyRequestStatuses_[rank].size() == 0 ) continue; @@ -599,7 +581,7 @@ bool AxpyInterface2::TestReplies ( Matrix& Z ) } // if test is true, then it is safe to free buffer - replyVectors_[rank][index].resize (0); + replyVectors_[rank][index].clear(); } return true; } @@ -618,10 +600,10 @@ bool AxpyInterface2::TestSendsCoord ( Matrix& Z ) for (int rank = 0; rank < p; ++rank) { dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer ()); + 
matrixBase_[rank].end(), Z.LockedBuffer()); index = (dit - matrixBase_[rank].begin()); - if ( index == matrixBase_[rank].size () ) + if ( index == matrixBase_[rank].size() ) continue; if ( sendIJRequestStatuses_[rank].size() == 0 ) continue; @@ -635,7 +617,7 @@ bool AxpyInterface2::TestSendsCoord ( Matrix& Z ) return false; } // if test is true, then it is safe to free buffer - coordVectors_[rank][index].resize (0); + coordVectors_[rank][index].clear(); } return true; } @@ -661,7 +643,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) while ( !DONE ) { // similar to HandleXYZ functions in original AxpyInterface - if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) + if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status) ) { switch (status.MPI_TAG) { @@ -693,10 +675,10 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) // check if all sends (data or request) are // complete for a particular matrix if ( TestSends( Z ) - && TestRecvs( Z ) + && TestRecvs( Z ) && TestRequests( Z ) - && TestReplies ( Z ) - && TestSendsCoord ( Z ) ) + && TestReplies ( Z ) + && TestSendsCoord ( Z ) ) { mpi::IBarrier ( g.VCComm(), nb_bar_request ); nb_bar_active = true; @@ -724,38 +706,38 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s const Int myCol = g.Col (); int height = Z.Height(); int width = Z.Width(); + // data vector std::vector getVector_; - std::vector getCoord_; + getVector_.resize (count); DEBUG_ONLY (if (count < Int (sizeof (T))) LogicError ("Count was too small");) - - getVector_.resize (count); - getCoord_.resize (2); - - DEBUG_ONLY (if - (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) + DEBUG_ONLY (if (Int (getVector_.size ()) != count) + LogicError ("Not enough space allocated");) - T *getBuffer = getVector_.data (); - // post receive - mpi::TaggedRecv (getBuffer, count, source, DATA_PUT_TAG, g.VCComm ()); - // post receive for coordinates - Int *coord = getCoord_.data(); - mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); - Int i = coord[0]; Int j = coord[1]; - + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_IJ_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; + + // post receive for data + T *getBuffer = getVector_.data(); + mpi::TaggedRecv (getBuffer, count, source, + DATA_PUT_TAG, g.VCComm()); + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; const Int colShift = Shift (myRow, colAlign, r); const Int rowShift = Shift (myCol, rowAlign, c); const Int localHeight = Length (height, colShift, r); const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); + const Int iLocalOffset = Length (i, Y.ColShift(), r); + const Int jLocalOffset = Length (j, Y.RowShift(), c); for (Int t = 0; t < localWidth; ++t) { @@ -765,8 +747,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s YCol[s] = XCol[s]; } // Free the memory - getVector_.resize ( 0 ); - getCoord_.resize ( 0 ); + getVector_.clear(); } // replica of above function except this accumulates @@ -781,41 +762,39 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so const Int myCol = g.Col (); const int height = Z.Height(); const int width = Z.Width(); + 
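Both local-to-global handlers end in the same scatter loop over the locally owned patch; only the innermost assignment differs between the put path (overwrite) and the accumulate path. Below is a standalone rendering of that loop, assuming a column-major local buffer with leading dimension YLDim; in the real handlers these values come from Shift/Length and Y.Buffer().

    // Standalone rendering of the handlers' scatter loop. Ylocal is the
    // process's column-major local buffer with leading dimension YLDim;
    // (iLocalOffset, jLocalOffset) locate the patch, and recvBuf holds
    // localHeight x localWidth entries packed column by column.
    template <typename T>
    void ScatterPatch
    ( T* Ylocal, int YLDim, int iLocalOffset, int jLocalOffset,
      const T* recvBuf, int localHeight, int localWidth, bool accumulate )
    {
        for( int t = 0; t < localWidth; ++t )
        {
            T* YCol = Ylocal + (jLocalOffset + t)*YLDim + iLocalOffset;
            const T* XCol = recvBuf + t*localHeight;
            for( int s = 0; s < localHeight; ++s )
            {
                if( accumulate )
                    YCol[s] += XCol[s]; // DATA_ACC_TAG path
                else
                    YCol[s] = XCol[s];  // DATA_PUT_TAG path
            }
        }
    }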
// data buffer - std::vector< T > getVector_; - std::vector getCoord_; + std::vector getVector_; + getVector_.resize (count); DEBUG_ONLY (if (count < Int (sizeof (T))) LogicError ("Count was too small");) - // data buffer resize - getVector_.resize (count); - getCoord_.resize (2); - - DEBUG_ONLY (if - (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) - - // post receive for data - T *getBuffer = getVector_.data (); - mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm ()); + DEBUG_ONLY (if (Int (getVector_.size ()) != count) + LogicError ("Not enough space allocated");) // post receive for coordinates - Int *coord = getCoord_.data(); - mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_IJ_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; + // post receive for data + T *getBuffer = getVector_.data(); + mpi::TaggedRecv (getBuffer, count, source, + DATA_ACC_TAG, g.VCComm()); + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; const Int colShift = Shift (myRow, colAlign, r); const Int rowShift = Shift (myCol, rowAlign, c); const Int localHeight = Length (height, colShift, r); const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); + const Int iLocalOffset = Length (i, Y.ColShift(), r); + const Int jLocalOffset = Length (j, Y.RowShift(), c); for (Int t = 0; t < localWidth; ++t) { @@ -825,8 +804,7 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so YCol[s] += XCol[s]; } // Free the memory - getVector_.resize ( 0 ); - getCoord_.resize ( 0 ); + getVector_.clear(); } // handle request for data, post a matching issend @@ -845,21 +823,23 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int p = g.Size(); const Int myRow = g.Row(); const Int myCol = g.Col(); - - // dummy vector for 0 byte receive - std::vector dummyVector_; - std::vector getCoord_; - + mpi::Status status; if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { const Int source = status.MPI_SOURCE; - getCoord_.resize (2); - + // dummy var for receiving request + // we don't use this anyway + T dummy_=0; + // post receive request for get + mpi::TaggedRecv (&dummy_, 1, source, + REQUEST_GET_TAG, g.VCComm()); + // post receive for coordinates - Int *coord = getCoord_.data(); - mpi::TaggedRecv (coord, 2, source, IJ_TAG, g.VCComm ()); + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_IJ_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; const Int colAlign = (Y.ColAlign() + i) % r; @@ -883,26 +863,18 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - // dummy buffers resize - dummyVector_.resize ( 0 ); - T *dummyBuffer = dummyVector_.data (); - // post receive request for get - mpi::TaggedRecv (dummyBuffer, 0, source, - REQUEST_GET_TAG, g.VCComm ()); - T* XBuffer = Z.Buffer(); const Int index = - NextIndex (numEntries, replyVectors_[source], + NextIndex (source, numEntries, replyVectors_[source], replyRequests_[source], replyRequestStatuses_[source], - matrixBase_[source], XBuffer); DEBUG_ONLY (if (Int (replyVectors_[source][index].size 
()) != numEntries) LogicError ("Error in NextIndex");) + T *replyBuffer = replyVectors_[source][index].data (); - for (Int t = 0; t < localWidth; ++t) { T *sendCol = &replyBuffer[t * localHeight]; @@ -912,10 +884,8 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) // Fire off non-blocking send mpi::TaggedISSend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), replyRequests_[source][index]); - - dummyVector_.resize ( 0 ); - getCoord_.resize ( 0 ); + DATA_GET_TAG, g.VCComm (), + replyRequests_[source][index]); } } From 757eb3c1c0fa099d4e14127aab786ed8e181878a Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sun, 17 Aug 2014 23:56:04 -0500 Subject: [PATCH 073/110] redesigned the nextindex logic completely, I got to do the same thing for rma now --- include/El/core/AxpyInterface2.0.hpp | 87 ++--- src/core/AxpyInterface2.0.cpp | 533 +++++++++++---------------- 2 files changed, 234 insertions(+), 386 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 3731d9a101..9e4e1928df 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -47,33 +47,19 @@ class AxpyInterface2 DATA_ACC_TAG =3, REQUEST_GET_TAG =4, COORD_IJ_TAG =5; - - /* Request objects for send, recv and request op */ - std::vector> - sendRequests_, requestRequests_, - recvRequests_, replyRequests_, - sendIJRequests_; - - /* Request statuses for send, recv and request op */ - std::vector> - sendRequestStatuses_, - recvRequestStatuses_, - requestRequestStatuses_, - replyRequestStatuses_, - sendIJRequestStatuses_; - - /* Stores matrix base addresses */ - std::vector> - matrixBase_; - /* Stores i, j coordinates */ - std::vector>> - coordVectors_; - - /* Receive and Send vectors */ - std::vector>> - recvVectors_, sendVectors_, - replyVectors_, requestVectors_; + struct matrix_params_ + { + T *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector matrices_; // need to add const here... DistMatrix* GlobalArrayPut_; @@ -82,44 +68,29 @@ class AxpyInterface2 bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - Int NextIndex ( Int rank, Int dataSize, - std::deque > &dataVectors, - std::deque &requests, - std::deque &requestStatus, - T * base_address ); - // note: this is just a placeholder - // would be replaced soon - Int NextIndex ( Int rank, Int dataSize, - std::deque > &dataVectors, - std::deque &requests, - std::deque &requestStatus, - const T * base_address ); - - Int NextIndex - ( Int rank, Int i, Int j, Int dataSize, - std::deque > &dataVectors, - std::deque &requestData, - std::deque &requestDataStatus, - T * base_address); - - Int NextIndex - ( Int rank, Int i, Int j, Int dataSize, - std::deque > &dataVectors, - std::deque &requestData, - std::deque &requestDataStatus, - const T * base_address); - + Int NextIndex ( + Int target, + Int dataSize, + T * base_address, + Int *matrix_index); + + Int NextIndex ( + Int target, + Int dataSize, + const T * base_address, + Int *matrix_index); + /* Test */ - // probably we need const interfaces also? 
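The matrix_params_ bookkeeping keys every queue off the matrix base address, so NextIndex can recycle any send slot whose request has already completed instead of growing the deques without bound. The sketch below reduces that recycling logic to a single target rank and raw MPI calls; SendSlots and NextSlot are illustrative names only, and the patch additionally maps base_ to one such structure per matrix and per rank.

    #include <mpi.h>
    #include <deque>
    #include <vector>

    // Reduced model of the per-matrix, per-rank queues held in
    // matrix_params_ (illustrative names, single target rank).
    struct SendSlots
    {
        std::deque<std::vector<double>> data_; // per-slot payload buffers
        std::deque<MPI_Request> requests_;     // per-slot send requests
        std::deque<bool> statuses_;            // true while a send is in flight
    };

    // Return a reusable slot index: test in-flight sends, recycle the
    // first finished one, and grow the queues only if all are busy.
    int NextSlot( SendSlots& s, int dataSize )
    {
        const int numCreated = (int)s.data_.size();
        for( int i = 0; i < numCreated; ++i )
        {
            if( s.statuses_[i] )
            {
                int finished = 0;
                MPI_Test( &s.requests_[i], &finished, MPI_STATUS_IGNORE );
                s.statuses_[i] = !finished;
            }
            if( !s.statuses_[i] )
            {
                s.statuses_[i] = true; // slot is taken again
                s.data_[i].resize( dataSize );
                return i;
            }
        }
        // every slot still busy: append a fresh one
        s.data_.resize( numCreated + 1 );
        s.data_[numCreated].resize( dataSize );
        s.requests_.push_back( MPI_REQUEST_NULL );
        s.statuses_.push_back( true );
        return numCreated;
    }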
bool TestRequests ( Matrix& Z ); - bool TestReplies ( Matrix& Z ); - bool TestSends ( Matrix& Z ); - bool TestRecvs ( Matrix& Z ); - bool TestSendsCoord ( Matrix& Z ); + bool TestRequests ( const Matrix& Z ); void HandleGlobalToLocalData( Matrix& Z ); void HandleLocalToGlobalData( Matrix& Z, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); + + void HandleGlobalToLocalData( const Matrix& Z ); + void HandleLocalToGlobalData( const Matrix& Z, Int count, Int source ); + void HandleLocalToGlobalAcc( const Matrix& Z, Int count, Int source ); }; } // namespace El #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index a436a11e20..fd00f086a6 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -14,13 +14,9 @@ namespace El template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - sendVectors_(0), recvVectors_(0), replyVectors_(0), coordVectors_(0), - requestVectors_(0), sendRequests_(0), recvRequests_(0), - replyRequests_(0), requestRequests_(0), sendIJRequests_(0), - matrixBase_(0), sendRequestStatuses_(0), requestRequestStatuses_(0), - replyRequestStatuses_(0), recvRequestStatuses_(0), - sendIJRequestStatuses_(0), toBeAttachedForPut_(false), - toBeAttachedForGet_(false), attached_(false), detached_(false) + matrices_(0), + toBeAttachedForGet_(false), toBeAttachedForPut_(false), + attached_(false), detached_(false) { } template @@ -35,27 +31,20 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayPut_ = &Z; GlobalArrayGet_ = &Z; - const Int p = Z.Grid ().Size(); - - requestVectors_.resize( p ); - sendVectors_.resize( p ); - recvVectors_.resize( p ); - replyVectors_.resize( p ); - coordVectors_.resize( p ); - - sendRequests_.resize (p); - recvRequests_.resize (p); - replyRequests_.resize (p); - requestRequests_.resize (p); - sendIJRequests_.resize (p); - - sendRequestStatuses_.resize (p); - recvRequestStatuses_.resize (p); - requestRequestStatuses_.resize (p); - replyRequestStatuses_.resize (p); - sendIJRequestStatuses_.resize (p); - - matrixBase_.resize (p); + if ( matrices_.empty() ) + { + const Grid& g = Z.Grid(); + const Int p = g.Size (); + + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } } template @@ -89,95 +78,117 @@ void AxpyInterface2::Attach( DistMatrix& Z ) attached_ = true; else LogicError("Must detach before reattaching."); + + // the matrix base_ is not known until + // an update operation (put/get/acc) + // so it is kept blank // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - } - const Grid& g = Z.Grid(); - const Int p = g.Size (); + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; - if (sendVectors_.size() != p) - { - recvVectors_.resize( p ); - sendVectors_.resize( p ); - replyVectors_.resize( p ); - requestVectors_.resize( p ); - coordVectors_.resize( p ); - - sendRequests_.resize (p); - recvRequests_.resize (p); - requestRequests_.resize (p); - replyRequests_.resize (p); - sendIJRequests_.resize (p); - - sendRequestStatuses_.resize (p); - recvRequestStatuses_.resize (p); - 
requestRequestStatuses_.resize (p); - replyRequestStatuses_.resize (p); - sendIJRequestStatuses_.resize (p); - - matrixBase_.resize (p); + const Grid& g = Z.Grid(); + const Int p = g.Size (); + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } } } template -Int AxpyInterface2::NextIndex -( Int rank, Int dataSize, - std::deque > &dataVectors, - std::deque &requests, - std::deque &requestStatus, - T * base_address ) +Int AxpyInterface2::NextIndex ( + Int target, + Int dataSize, + T * base_address, + Int *mindex) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) - const Int Index = Int(requests.size ()); - dataVectors.resize (Index + 1); - dataVectors[Index].resize (dataSize); + assert ( base_address != NULL ); - requests.push_back (mpi::REQUEST_NULL); - requestStatus.push_back ( true ); - - // stores Matrix base address by index - matrixBase_[rank].push_back (base_address); + Int matrixIndex = 0; + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + if ( matrices_[m].base_ == NULL ) + { + matrices_[m].base_ = base_address; + matrixIndex = m; + break; + } + matrixIndex = m+1; + } - return Index; -} + // need to create new object + if ( matrixIndex == numMatrices) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; + } + // go through the request, data, + // status objects + const Int numCreated = matrices_[matrixIndex].data_[target].size (); + DEBUG_ONLY (if (numCreated != Int (matrices_[matrixIndex].requests_[target].size ()) || + numCreated != Int (matrices_[matrixIndex].statuses_[target].size ())) + LogicError ("size mismatch");) + + for (Int i = 0; i < numCreated; ++i) + { + // If this request is still running, + // test to see if it finished. 
+ if (matrices_[matrixIndex].statuses_[target][i]) + { + const bool finished = mpi::Test (matrices_[matrixIndex].requests_[target][i]); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } -template -Int AxpyInterface2::NextIndex -( Int rank, Int i, Int j, Int dataSize, - std::deque > &dataVectors, - std::deque &requestData, - std::deque &requestDataStatus, - T * base_address) -{ - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) - const Int Index = Int(requestData.size ()); - - dataVectors.resize (Index + 1); - dataVectors[Index].resize (dataSize); + if (!matrices_[matrixIndex].statuses_[target][i]) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize ( dataSize ); + *mindex = matrixIndex; + return i; + } + } - coordVectors_[rank].resize (Index + 1); - coordVectors_[rank][Index].resize (2); - coordVectors_[rank][Index][0] = i; - coordVectors_[rank][Index][1] = j; - - requestData.push_back (mpi::REQUEST_NULL); - requestDataStatus.push_back ( true ); - - sendIJRequests_[rank].push_back (mpi::REQUEST_NULL); - sendIJRequestStatuses_[rank].push_back ( true ); - - // stores Matrix base address by index - matrixBase_[rank].push_back (base_address); - - return Index; + matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); + matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); + matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); + matrices_[matrixIndex].statuses_[target].push_back ( true ); + *mindex = matrixIndex; + + return numCreated; } template @@ -213,6 +224,8 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) Int receivingCol = myProcessCol; const Int YLDim = Y.LDim (); + + Int matrix_index; for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); - const Int index = - NextIndex (destination, i, j, numEntries, - sendVectors_[destination], - sendRequests_[destination], - sendRequestStatuses_[destination], - XBuffer); + const Int dindex = + NextIndex (destination, + numEntries, + XBuffer, + &matrix_index); DEBUG_ONLY (if - (Int (sendVectors_[destination][index].size ()) != + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = sendVectors_[destination][index].data (); + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) // put request mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), - sendRequests_[destination][index]); - + matrices_[matrix_index].requests_[destination][dindex]); + + const Int cindex = + NextIndex (destination, + 2, + XBuffer, + &matrix_index); // send coordinates - Int *coord = coordVectors_[destination][index].data (); + Int *coord = reinterpret_cast(matrices_[matrix_index].data_[destination][cindex].data ()); + coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), - sendIJRequests_[destination][index]); + matrices_[matrix_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -286,28 +304,35 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) std::vector recvVector_; + Int matrix_index; + T* XBuffer = Z.Buffer(); // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { // we just use the request objects for progress - const Int index = - NextIndex 
(rank, i, j, 1, - requestVectors_[rank], - requestRequests_[rank], - requestRequestStatuses_[rank], - XBuffer); + const Int dindex = + NextIndex (rank, + 1, + XBuffer, + &matrix_index); // send request - T *requestBuffer = requestVectors_[rank][index].data(); - mpi::TaggedISSend (requestBuffer, 1, rank, + T *requestBuffer = matrices_[matrix_index].data_[rank][dindex].data(); + mpi::TaggedISend (requestBuffer, 1, rank, REQUEST_GET_TAG, g.VCComm(), - requestRequests_[rank][index]); + matrices_[matrix_index].requests_[rank][dindex]); + const Int cindex = + NextIndex (rank, + 2, + XBuffer, + &matrix_index); // send coordinates - Int *coord = coordVectors_[rank][index].data (); + Int *coord = reinterpret_cast(matrices_[matrix_index].data_[rank][cindex].data ()); + coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, rank, COORD_IJ_TAG, g.VCComm (), - sendIJRequests_[rank][index]); + matrices_[matrix_index].requests_[rank][cindex]); } // Receive all of the replies @@ -366,6 +391,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) LogicError("Submatrix out of bounds of global matrix"); @@ -378,7 +404,9 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - + + Int matrix_index; + const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); @@ -402,18 +430,17 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); - const Int index = - NextIndex (destination, i, j, numEntries, - sendVectors_[destination], - sendRequests_[destination], - sendRequestStatuses_[destination], - XBuffer); + const Int dindex = + NextIndex (destination, + numEntries, + XBuffer, + &matrix_index); DEBUG_ONLY (if - (Int (sendVectors_[destination][index].size ()) != + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != numEntries) LogicError ("Error in NextIndex");) - - T *sendBuffer = sendVectors_[destination][index].data (); + + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) // send data mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), - sendRequests_[destination][index]); - + matrices_[matrix_index].requests_[destination][dindex]); + + const Int cindex = + NextIndex (destination, + 2, + XBuffer, + &matrix_index); // send coordinates - Int *coord = coordVectors_[destination][index].data(); + Int *coord = reinterpret_cast(matrices_[matrix_index].data_[destination][cindex].data()); + coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm(), - sendIJRequests_[destination][index]); + matrices_[matrix_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -439,6 +472,10 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) } } +// free requests - this is necessary for freeing +// requests of ISends after successful Recvs, +// because + // progress communication for a particular matrix // progress requests template @@ -447,180 +484,40 @@ bool AxpyInterface2::TestRequests ( Matrix& Z ) DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); const Int p = g.Size(); - - Int index; - typename std::deque::iterator 
dit; - - for (int rank = 0; rank < p; ++rank) - { - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer()); - index = (dit - matrixBase_[rank].begin()); - - if ( index == matrixBase_[rank].size() ) - continue; - if ( requestRequestStatuses_[rank].size() == 0 ) - continue; - // test all send requests related to matrix - const Int numStatuses = requestRequestStatuses_[rank].size(); - for (int i = 0; i < numStatuses; i++) - { - requestRequestStatuses_[rank][i] = !mpi::Test ( requestRequests_[rank][i] ); - if ( requestRequestStatuses_[rank][i] ) - return false; - } - // okay to deallocate - requestVectors_[rank][index].clear(); - } - return true; -} - -// progress sends -template -bool AxpyInterface2::TestSends ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - - Int index; - typename std::deque::iterator dit; - - for (int rank = 0; rank < p; ++rank) - { - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer()); - index = (dit - matrixBase_[rank].begin()); - - if ( index == matrixBase_[rank].size() ) - continue; - if ( sendRequestStatuses_[rank].size() == 0 ) - continue; - - // test all sends related to matrix - const Int numStatuses = sendRequestStatuses_[rank].size(); - for (int i = 0; i < numStatuses; i++) - { - sendRequestStatuses_[rank][i] = !mpi::Test ( sendRequests_[rank][i] ); - if ( sendRequestStatuses_[rank][i] ) - return false; - } - - // if test is true, then it is safe to free buffer - sendVectors_[rank][index].clear(); - } - return true; -} - -// progress recvs -template -bool AxpyInterface2::TestRecvs ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - - Int index; - typename std::deque::iterator dit; + const Int numMatrices = matrices_.size(); + Int matrixIndex; + const T *base_address = Z.LockedBuffer(); - for (int rank = 0; rank < p; ++rank) + // search for matrix base + for (Int m = 0; m < numMatrices; m++) { - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer()); - index = (dit - matrixBase_[rank].begin()); - - if ( index == matrixBase_[rank].size() ) - continue; - if ( recvRequestStatuses_[rank].size() == 0 ) - continue; - - // test all sends related to matrix - const Int numStatuses = recvRequestStatuses_[rank].size(); - for (int i = 0; i < numStatuses; i++) + if ( matrices_[m].base_ == base_address ) { - recvRequestStatuses_[rank][i] = !mpi::Test ( recvRequests_[rank][i] ); - if ( recvRequestStatuses_[rank][i] ) - return false; + matrixIndex = m; + break; } - - // if test is true, then it is safe to free buffer - recvVectors_[rank][index].clear(); + matrixIndex = m+1; } - return true; -} -// progress replies -template -bool AxpyInterface2::TestReplies ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - - Int index; - typename std::deque::iterator dit; + // matrix not found + if ( matrixIndex == numMatrices) + return true; for (int rank = 0; rank < p; ++rank) { - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer()); - index = (dit - matrixBase_[rank].begin()); - - if ( index == matrixBase_[rank].size() ) + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - if ( replyRequestStatuses_[rank].size() == 0 ) - continue; - - // test all sends related to matrix - const Int numStatuses = replyRequestStatuses_[rank].size(); - for (int i = 0; i < 
numStatuses; i++) + const Int numStatuses = matrices_[matrixIndex].requests_[rank].size (); + for (int i = 0; i < numStatuses; i++) { - replyRequestStatuses_[rank][i] = !mpi::Test ( replyRequests_[rank][i] ); - if ( replyRequestStatuses_[rank][i] ) + matrices_[matrixIndex].statuses_[rank][i] = !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + if ( matrices_[matrixIndex].statuses_[rank][i] ) return false; } - - // if test is true, then it is safe to free buffer - replyVectors_[rank][index].clear(); } return true; } -// progress coordinate sends -template -bool AxpyInterface2::TestSendsCoord ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - - Int index; - typename std::deque::iterator dit; - - for (int rank = 0; rank < p; ++rank) - { - dit = std::find ( matrixBase_[rank].begin(), - matrixBase_[rank].end(), Z.LockedBuffer()); - index = (dit - matrixBase_[rank].begin()); - - if ( index == matrixBase_[rank].size() ) - continue; - if ( sendIJRequestStatuses_[rank].size() == 0 ) - continue; - - // test all sends related to matrix - const Int numStatuses = sendIJRequestStatuses_[rank].size(); - for (int i = 0; i < numStatuses; i++) - { - sendIJRequestStatuses_[rank][i] = !mpi::Test ( sendIJRequests_[rank][i] ); - if ( sendIJRequestStatuses_[rank][i] ) - return false; - } - // if test is true, then it is safe to free buffer - coordVectors_[rank][index].clear(); - } - return true; -} // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress @@ -674,11 +571,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { // check if all sends (data or request) are // complete for a particular matrix - if ( TestSends( Z ) - && TestRecvs( Z ) - && TestRequests( Z ) - && TestReplies ( Z ) - && TestSendsCoord ( Z ) ) + if ( TestRequests( Z ) ) { mpi::IBarrier ( g.VCComm(), nb_bar_request ); nb_bar_active = true; @@ -824,6 +717,8 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int myRow = g.Row(); const Int myCol = g.Col(); + Int matrix_index; + mpi::Status status; if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) @@ -831,13 +726,13 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int source = status.MPI_SOURCE; // dummy var for receiving request // we don't use this anyway - T dummy_=0; + T dummy_[1]; // post receive request for get - mpi::TaggedRecv (&dummy_, 1, source, + mpi::TaggedRecv (dummy_, 1, source, REQUEST_GET_TAG, g.VCComm()); // post receive for coordinates - Int coord[2]; + Int coord[2] = {-1, -1}; mpi::TaggedRecv (coord, 2, source, COORD_IJ_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; @@ -865,16 +760,16 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) T* XBuffer = Z.Buffer(); const Int index = - NextIndex (source, numEntries, replyVectors_[source], - replyRequests_[source], - replyRequestStatuses_[source], - XBuffer); + NextIndex (source, + numEntries, + XBuffer, + &matrix_index); DEBUG_ONLY (if - (Int (replyVectors_[source][index].size ()) != + (Int (matrices_[matrix_index].data_[source][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *replyBuffer = replyVectors_[source][index].data (); + T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); for (Int t = 0; t < localWidth; ++t) { T *sendCol = &replyBuffer[t * localHeight]; @@ -885,7 +780,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) // Fire off 
non-blocking send mpi::TaggedISSend (replyBuffer, numEntries, source, DATA_GET_TAG, g.VCComm (), - replyRequests_[source][index]); + matrices_[matrix_index].requests_[source][index]); } } @@ -914,25 +809,7 @@ void AxpyInterface2::Detach() GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - sendVectors_.clear(); - coordVectors_.clear(); - recvVectors_.clear(); - replyVectors_.clear(); - requestVectors_.clear(); - - sendIJRequests_.clear(); - sendRequests_.clear(); - recvRequests_.clear(); - requestRequests_.clear(); - replyRequests_.clear(); - - sendRequestStatuses_.clear(); - sendIJRequestStatuses_.clear(); - recvRequestStatuses_.clear(); - requestRequestStatuses_.clear(); - replyRequestStatuses_.clear(); - - matrixBase_.clear(); + matrices_.clear(); } template class AxpyInterface2; From 924b36cffce441fedc34e97cc869c8773855e721 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 18 Aug 2014 11:34:40 -0500 Subject: [PATCH 074/110] flush modified, snipped some junk code...however, we should move away from win_flush in flush, and use other fine grain synch methods, I am going to soon make some changes in regards... --- include/El/core/RmaInterface.hpp | 9 +- src/core/RmaInterface.cpp | 332 +++++++++---------------------- 2 files changed, 99 insertions(+), 242 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 147f8d819b..e43448af52 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -52,15 +52,18 @@ class RmaInterface std::vector>> getVector_, putVector_; + std::vector> + putStatus_, getStatus_; + DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; bool toBeAttachedForPut_, toBeAttachedForGet_, - attached_, detached_, preceeding_put_, - preceeding_get_; + attached_, detached_; Int NextIndex ( Int dataSize, - std::deque > &dataVectors ); + std::deque > &dataVectors, + std::deque &statuses); }; #endif //MPI-3 } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 38ebb86299..fffe820470 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -29,8 +29,7 @@ RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), putVector_(0), getVector_(0), window (MPI_WIN_NULL), toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(false), preceeding_get_(false), - preceeding_put_(false) + attached_(false), detached_(false) { } template @@ -42,15 +41,17 @@ RmaInterface::RmaInterface( DistMatrix& Z ) detached_ = false; toBeAttachedForGet_ = true; toBeAttachedForPut_ = true; - preceeding_put_ = false; - preceeding_get_ = false; GlobalArrayPut_ = &Z; GlobalArrayGet_ = &Z; window = MPI_WIN_NULL; const Int p = Z.Grid ().Size(); + putVector_.resize( p ); getVector_.resize( p ); + + putStatus_.resize( p ); + getStatus_.resize( p ); } template @@ -62,15 +63,17 @@ RmaInterface::RmaInterface( const DistMatrix& X ) detached_ = false; toBeAttachedForGet_ = true; toBeAttachedForPut_ = false; - preceeding_put_ = false; - preceeding_get_ = false; GlobalArrayGet_ = &X; GlobalArrayPut_ = 0; window = MPI_WIN_NULL; const Int p = X.Grid ().Size (); + getVector_.resize( p ); putVector_.resize( p ); + + putStatus_.resize( p ); + getStatus_.resize( p ); } template @@ -123,6 +126,9 @@ void RmaInterface::Attach( DistMatrix& Z ) { getVector_.resize( p ); putVector_.resize( p ); + + putStatus_.resize( p ); + getStatus_.resize( p ); } // do rma related checks @@ -161,6 +167,9 @@ void RmaInterface::Attach( const DistMatrix& X ) { getVector_.resize( p 
); putVector_.resize( p ); + + putStatus_.resize( p ); + getStatus_.resize( p ); } //do rma related checks @@ -176,15 +185,28 @@ void RmaInterface::Attach( const DistMatrix& X ) template Int RmaInterface::NextIndex ( Int dataSize, - std::deque > &dataVectors ) + std::deque > &dataVectors, + std::deque &statuses ) { DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) - const Int Index = Int(dataVectors.size ()); - - dataVectors.resize (Index + 1); - dataVectors[Index].resize (dataSize); - - return Index; + const Int numCreated = dataVectors.size (); + DEBUG_ONLY (if (numCreated != Int (statuses.size ())) + ("size mismatch");) + // check if we may reuse an existing index + for (Int i = 0; i < numCreated; ++i) + { + if (!statuses[i]) + { + statuses[i] = true; + dataVectors[i].resize (dataSize); + return i; + } + } + dataVectors.resize (numCreated + 1); + dataVectors[numCreated].resize (dataSize); + statuses.push_back (true); + + return numCreated; } template @@ -237,15 +259,14 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination] ); + putVector_[destination], putStatus_[destination] ); T* sendBuffer = putVector_[destination][index].data(); - T* sendData = reinterpret_cast(sendBuffer); T* XBuffer = Z.Buffer(); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); - // clear - putVector_[destination][index].resize (0); + putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -318,14 +338,13 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination] ); + putVector_[destination], putStatus_[destination] ); T* sendBuffer = putVector_[destination][index].data(); - T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); for( Int t=0; t::Put( const Matrix& Z, Int i, Int j ) } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); - // clear - putVector_[destination][index].resize (0); + putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -399,7 +417,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - getVector_[destination] ); + getVector_[destination], getStatus_[destination]); T *getBuffer = getVector_[destination][index].data (); // get @@ -412,17 +430,15 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // no difference between localflush // and flush for Get mpi::FlushLocal (destination, window); - T* getData = reinterpret_cast(getBuffer); + getStatus_[destination][index] = false; // update local matrix for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination] ); + putVector_[destination], putStatus_[destination] ); T* sendBuffer = putVector_[destination][index].data(); - T* sendData = reinterpret_cast(sendBuffer); T* XBuffer = Z.Buffer(); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); - // clear - putVector_[destination][index].resize 
(0); + putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } + #ifdef EL_EXPLICIT_PROGRESS RmaProgress (g.VCComm ()); #endif @@ -565,14 +580,13 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination] ); + putVector_[destination], putStatus_[destination]); T* sendBuffer = putVector_[destination][index].data(); - T* sendData = reinterpret_cast(sendBuffer); const T* XBuffer = Z.LockedBuffer(); for( Int t=0; t::Acc( const Matrix& Z, Int i, Int j ) } // local flush, okay to clear buffers after this mpi::FlushLocal (destination, window); - // clear - putVector_[destination][index].resize (0); + putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } + #ifdef EL_EXPLICIT_PROGRESS RmaProgress (g.VCComm ()); #endif } -// local accumulate, Z += Get Y(i:i+height-1,j:j+width-1), -// where Z is local matrix height x width -template -void RmaInterface::LocalAcc( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalAcc")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - - const DistMatrix &X = *GlobalArrayGet_; - - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - - const Int XLDim = X.LDim (); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - for( Int step=0; step::NextIndex ( numEntries, - getVector_[destination] ); - T *getBuffer = getVector_[destination][index].data (); - - // get - for( Int t=0; t(getBuffer); - // update local matrix - for( Int t=0; t void RmaInterface::Flush( Matrix& Z, Int i, Int j ) { @@ -690,38 +620,19 @@ void RmaInterface::Flush( Matrix& Z, Int i, Int j ) //do rma related checks const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; for( Int step=0; step::Flush( const Matrix& Z, Int i, Int j ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - const DistMatrix& Y = *GlobalArrayGet_; + DistMatrix& Y = *GlobalArrayPut_; //do rma related checks const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); const Int p = 
g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; for( Int step=0; step::Flush( const Matrix& Z, Int i, Int j ) template void RmaInterface::Flush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) @@ -784,41 +675,22 @@ void RmaInterface::Flush( Matrix& Z ) DistMatrix& Y = *GlobalArrayPut_; - //do rma related checks const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - // i = j = 0 - leftmost coordinates of DistMatrix - const Int colAlign = Y.ColAlign() % r; - const Int rowAlign = Y.RowAlign() % c; - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + // flush all + mpi::Flush ( window ); + + // clear statuses for( Int step=0; step::Flush( const Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - // rma checks, see if Z is not NULL, etc - const DistMatrix& Y = *GlobalArrayGet_; - - //do rma related checks + DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - // i = j = 0 - leftmost coordinates of DistMatrix - const Int colAlign = Y.ColAlign() % r; - const Int rowAlign = Y.RowAlign() % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + // flush all + mpi::Flush ( window ); + + // clear statuses for( Int step=0; step::Detach() putVector_.clear(); getVector_.clear(); + putStatus_.clear(); + getStatus_.clear(); + mpi::WindowUnlock (window); mpi::WindowFree (window); } From e34ad17e934054ada636cdce29a2ca87d9281175 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 18 Aug 2014 14:39:36 -0500 Subject: [PATCH 075/110] separate struct to handle data and coord...we send coord and data separately --- include/El/core/AxpyInterface2.0.hpp | 42 ++++- src/core/AxpyInterface2.0.cpp | 236 ++++++++++++++++++++++----- 2 files changed, 231 insertions(+), 47 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 9e4e1928df..a71cb57f71 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -48,6 +48,7 @@ class AxpyInterface2 REQUEST_GET_TAG =4, COORD_IJ_TAG =5; + // struct for passing data struct matrix_params_ { T *base_; @@ -61,6 +62,20 @@ class AxpyInterface2 std::vector matrices_; + // struct for passing coordinates + struct coord_params_ + { + T *base_; + std::vector>> + coord_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector coords_; + // need to add const here... 
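// [Editorial sketch] Why the coordinates get their own persistent
// coord_params_ pool: mpi::TaggedISend only *starts* a transfer, so the
// two Ints holding (i,j) must stay alive until the request completes; a
// stack temporary would be a use-after-scope bug. Illustration with plain
// MPI (the tag value 5 is made up for the example):
//
//   #include <mpi.h>
//   #include <array>
//   #include <deque>
//
//   // WRONG: coord dies at the closing brace, possibly before MPI has
//   // read it:
//   //   { int coord[2] = { i, j };
//   //     MPI_Isend( coord, 2, MPI_INT, target, 5, comm, &req ); }
//
//   // RIGHT: stage (i,j) in storage that outlives the Isend; a deque
//   // never relocates existing elements on push_back
//   std::deque<std::array<int,2>> staged;
//   staged.push_back( std::array<int,2>{ { i, j } } );
//   MPI_Isend( staged.back().data(), 2, MPI_INT, target, 5, comm, &req );
//   // ... later: MPI_Test/MPI_Wait on req before reusing the entry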
DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; @@ -68,22 +83,37 @@ class AxpyInterface2 bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - Int NextIndex ( + Int NextIndexMatrix ( Int target, Int dataSize, T * base_address, Int *matrix_index); - Int NextIndex ( + Int NextIndexMatrix ( Int target, Int dataSize, const T * base_address, Int *matrix_index); - + + Int NextIndexCoord ( + Int i, Int j, + Int target, + T * base_address, + Int *matrix_index); + + Int NextIndexCoord ( + Int i, Int j, + Int target, + const T * base_address, + Int *matrix_index); + /* Test */ - bool TestRequests ( Matrix& Z ); - bool TestRequests ( const Matrix& Z ); - + bool TestMatrix ( Matrix& Z ); + bool TestMatrix ( const Matrix& Z ); + + bool TestCoord ( Matrix& Z ); + bool TestCoord ( const Matrix& Z ); + void HandleGlobalToLocalData( Matrix& Z ); void HandleLocalToGlobalData( Matrix& Z, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index fd00f086a6..86a4ed8558 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -14,7 +14,7 @@ namespace El template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - matrices_(0), + matrices_(0), coords_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), attached_(false), detached_(false) { } @@ -31,11 +31,11 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayPut_ = &Z; GlobalArrayGet_ = &Z; + const Grid& g = Z.Grid(); + const Int p = g.Size (); + if ( matrices_.empty() ) { - const Grid& g = Z.Grid(); - const Int p = g.Size (); - struct matrix_params_ mp; mp.data_.resize(p); mp.requests_.resize(p); @@ -45,6 +45,18 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) // with default constructor matrices_.push_back( mp ); } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } } template @@ -106,17 +118,29 @@ void AxpyInterface2::Attach( DistMatrix& Z ) // with default constructor matrices_.push_back( mp ); } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } } } template -Int AxpyInterface2::NextIndex ( +Int AxpyInterface2::NextIndexMatrix ( Int target, Int dataSize, T * base_address, Int *mindex) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndex")) + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexMatrix")) assert ( base_address != NULL ); @@ -191,6 +215,90 @@ Int AxpyInterface2::NextIndex ( return numCreated; } +template +Int AxpyInterface2::NextIndexCoord ( + Int i, Int j, + Int target, + T * base_address, + Int *cindex) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexCoord")) + + assert ( base_address != NULL ); + + Int coordIndex = 0; + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numCoords = coords_.size(); + + // search for matrix base + for (Int m = 0; m < numCoords; m++) + { + if ( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + if ( coords_[m].base_ == NULL ) + { + coords_[m].base_ = base_address; + coordIndex = m; + break; + } + coordIndex = 
m+1; + } + + // need to create new object + if ( coordIndex == numCoords ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + coords_[coordIndex].base_ = base_address; + } + // go through the request, data, + // status objects + const Int numCreated = coords_[coordIndex].coord_[target].size (); + DEBUG_ONLY (if (numCreated != Int (coords_[coordIndex].requests_[target].size ()) || + numCreated != Int (matrices_[coordIndex].statuses_[target].size ())) + LogicError ("size mismatch");) + + for (Int i = 0; i < numCreated; ++i) + { + // If this request is still running, + // test to see if it finished. + if (coords_[coordIndex].statuses_[target][i]) + { + const bool finished = mpi::Test (coords_[coordIndex].requests_[target][i]); + coords_[coordIndex].statuses_[target][i] = !finished; + } + + if (!coords_[coordIndex].statuses_[target][i]) + { + coords_[coordIndex].statuses_[target][i] = true; + coords_[coordIndex].coord_[target][i][0] = i; + coords_[coordIndex].coord_[target][i][1] = j; + *cindex = coordIndex; + return i; + } + } + + coords_[coordIndex].coord_[target].resize ( numCreated + 1 ); + coords_[coordIndex].coord_[target][numCreated][0] = i; + coords_[coordIndex].coord_[target][numCreated][1] = j; + coords_[coordIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); + coords_[coordIndex].statuses_[target].push_back ( true ); + *cindex = coordIndex; + + return numCreated; +} + template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { @@ -225,7 +333,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int YLDim = Y.LDim (); - Int matrix_index; + Int matrix_index, coord_index; for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int dindex = - NextIndex (destination, + NextIndexMatrix (destination, numEntries, XBuffer, &matrix_index); DEBUG_ONLY (if (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndex");) + numEntries) LogicError ("Error in NextIndexMatrix");) T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) DATA_PUT_TAG, g.VCComm (), matrices_[matrix_index].requests_[destination][dindex]); + // send coordinates const Int cindex = - NextIndex (destination, - 2, + NextIndexCoord (i, j, + destination, XBuffer, - &matrix_index); - // send coordinates - Int *coord = reinterpret_cast(matrices_[matrix_index].data_[destination][cindex].data ()); + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data ()); coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), - matrices_[matrix_index].requests_[destination][cindex]); + coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -304,7 +413,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) std::vector recvVector_; - Int matrix_index; + Int matrix_index, coord_index; T* XBuffer = Z.Buffer(); // Send out the requests to all processes in the grid @@ -312,27 +421,29 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { // we just use the request objects for progress const Int dindex = - NextIndex (rank, - 1, - XBuffer, - &matrix_index); + NextIndexMatrix (rank, + 1, + XBuffer, + 
&matrix_index); + DEBUG_ONLY (if (Int (matrices_[matrix_index].data_[rank][dindex].size ()) != + 1) LogicError ("Error in NextIndexMatrix");) // send request T *requestBuffer = matrices_[matrix_index].data_[rank][dindex].data(); mpi::TaggedISend (requestBuffer, 1, rank, REQUEST_GET_TAG, g.VCComm(), matrices_[matrix_index].requests_[rank][dindex]); - + + // send coordinates const Int cindex = - NextIndex (rank, - 2, + NextIndexCoord (i, j, + rank, XBuffer, - &matrix_index); - // send coordinates - Int *coord = reinterpret_cast(matrices_[matrix_index].data_[rank][cindex].data ()); + &coord_index); + Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, rank, COORD_IJ_TAG, g.VCComm (), - matrices_[matrix_index].requests_[rank][cindex]); + coords_[coord_index].requests_[rank][cindex]); } // Receive all of the replies @@ -405,7 +516,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - Int matrix_index; + Int matrix_index, coord_index; const Int XLDim = Z.LDim(); // local matrix width and height @@ -430,15 +541,17 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); + + // send data const Int dindex = - NextIndex (destination, + NextIndexMatrix (destination, numEntries, XBuffer, &matrix_index); DEBUG_ONLY (if (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndex");) + numEntries) LogicError ("Error in NextIndexMatrix");) T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } - // send data mpi::TaggedISSend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), matrices_[matrix_index].requests_[destination][dindex]); + // send coordinates const Int cindex = - NextIndex (destination, - 2, + NextIndexCoord (i, j, + destination, XBuffer, - &matrix_index); - // send coordinates - Int *coord = reinterpret_cast(matrices_[matrix_index].data_[destination][cindex].data()); + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm(), - matrices_[matrix_index].requests_[destination][cindex]); + coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -479,7 +592,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) // progress communication for a particular matrix // progress requests template -bool AxpyInterface2::TestRequests ( Matrix& Z ) +bool AxpyInterface2::TestMatrix ( Matrix& Z ) { DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); @@ -518,6 +631,46 @@ bool AxpyInterface2::TestRequests ( Matrix& Z ) return true; } +template +bool AxpyInterface2::TestCoord ( Matrix& Z ) +{ + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numCoords = coords_.size(); + Int coordIndex; + const T *base_address = Z.LockedBuffer(); + + // search for coord base + for (Int m = 0; m < numCoords; m++) + { + if ( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + coordIndex = m+1; + } + + // coord not found + if ( coordIndex == numCoords) + return true; + + for (int rank = 0; rank < p; ++rank) + { + 
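// [Editorial note] The sweep below tests the coordinate requests one by
// one; the same check can be made in a single call per rank with
// MPI_Testall, which either completes every pending request and sets the
// flag, or completes none. A sketch, assuming the rank's requests are
// kept contiguously in a std::vector<MPI_Request> reqs rather than in the
// deques used here:
//
//   int allDone = 0;
//   MPI_Testall( static_cast<int>( reqs.size() ), reqs.data(),
//                &allDone, MPI_STATUSES_IGNORE );
//   if( !allDone )
//       return false;   // some coordinate send is still in flight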
if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + const Int numStatuses = coords_[coordIndex].requests_[rank].size (); + for (int i = 0; i < numStatuses; i++) + { + coords_[coordIndex].statuses_[rank][i] = !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + if ( coords_[coordIndex].statuses_[rank][i] ) + return false; + } + } + return true; +} + // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress @@ -571,7 +724,7 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) { // check if all sends (data or request) are // complete for a particular matrix - if ( TestRequests( Z ) ) + if ( TestMatrix( Z ) && TestCoord( Z ) ) { mpi::IBarrier ( g.VCComm(), nb_bar_request ); nb_bar_active = true; @@ -760,14 +913,14 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) T* XBuffer = Z.Buffer(); const Int index = - NextIndex (source, + NextIndexMatrix (source, numEntries, XBuffer, &matrix_index); DEBUG_ONLY (if (Int (matrices_[matrix_index].data_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) + numEntries) LogicError ("Error in NextIndexMatrix");) T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); for (Int t = 0; t < localWidth; ++t) @@ -810,6 +963,7 @@ void AxpyInterface2::Detach() GlobalArrayGet_ = 0; matrices_.clear(); + coords_.clear(); } template class AxpyInterface2; From 94f8f82d152d2280a260ee0571919d14867dfb3f Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 18 Aug 2014 16:05:45 -0500 Subject: [PATCH 076/110] tweaking rma --- include/El/core/imports/mpi.hpp | 10 ++++++++++ src/core/RmaInterface.cpp | 18 +++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 690833e27d..33f24aaf37 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -48,6 +48,16 @@ namespace mpi { #define EL_USE_DERIVED_DATATYPE #endif +// explicit progress for RMA +//#ifndef EL_EXPLICIT_PROGRESS +//#define EL_EXPLICIT_PROGRESS +//#endif + +// no acc ordering +//#ifndef EL_NO_ACC_ORDERING +//#define EL_NO_ACC_ORDERING +//#endif + #ifndef EL_INT_SAFE_CAST #define EL_INT_SAFE_CAST(x) \ (x < std::numeric_limits::max () && \ diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index fffe820470..3e50f5e567 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -276,8 +276,8 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) destination, disp, localHeight, window); } // local flush, okay to clear buffers after this - mpi::FlushLocal (destination, window); - putStatus_[destination][index] = false; + //mpi::FlushLocal (destination, window); + //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -354,8 +354,8 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) destination, disp, localHeight, window); } // local flush, okay to clear buffers after this - mpi::FlushLocal (destination, window); - putStatus_[destination][index] = false; + //mpi::FlushLocal (destination, window); + //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -430,7 +430,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // no difference between localflush // and flush for Get mpi::FlushLocal (destination, window); - getStatus_[destination][index] = false; + //getStatus_[destination][index] = false; // 
update local matrix for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) destination, disp, localHeight, window); } // local flush, okay to clear buffers after this - mpi::FlushLocal (destination, window); - putStatus_[destination][index] = false; + //mpi::FlushLocal (destination, window); + //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -596,8 +596,8 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) destination, disp, localHeight, window); } // local flush, okay to clear buffers after this - mpi::FlushLocal (destination, window); - putStatus_[destination][index] = false; + //mpi::FlushLocal (destination, window); + //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) From f5dba360959417278abdf96315dd6f33a6fd17bb Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 18 Aug 2014 17:44:23 -0500 Subject: [PATCH 077/110] removing functionality to store put/acc statuses...will use flush always --- include/El/core/RmaInterface.hpp | 7 +- src/core/RmaInterface.cpp | 215 ++++++++++++++++++------------- 2 files changed, 126 insertions(+), 96 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index e43448af52..9a2be10b55 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -37,7 +37,6 @@ class RmaInterface void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - void LocalAcc( Matrix& Z, Int i, Int j ); void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); @@ -52,9 +51,6 @@ class RmaInterface std::vector>> getVector_, putVector_; - std::vector> - putStatus_, getStatus_; - DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; @@ -62,8 +58,7 @@ class RmaInterface attached_, detached_; Int NextIndex ( Int dataSize, - std::deque > &dataVectors, - std::deque &statuses); + std::deque > &dataVectors ); }; #endif //MPI-3 } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 3e50f5e567..d20e212c2d 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -50,8 +50,6 @@ RmaInterface::RmaInterface( DistMatrix& Z ) putVector_.resize( p ); getVector_.resize( p ); - putStatus_.resize( p ); - getStatus_.resize( p ); } template @@ -72,8 +70,6 @@ RmaInterface::RmaInterface( const DistMatrix& X ) getVector_.resize( p ); putVector_.resize( p ); - putStatus_.resize( p ); - getStatus_.resize( p ); } template @@ -126,9 +122,6 @@ void RmaInterface::Attach( DistMatrix& Z ) { getVector_.resize( p ); putVector_.resize( p ); - - putStatus_.resize( p ); - getStatus_.resize( p ); } // do rma related checks @@ -167,9 +160,6 @@ void RmaInterface::Attach( const DistMatrix& X ) { getVector_.resize( p ); putVector_.resize( p ); - - putStatus_.resize( p ); - getStatus_.resize( p ); } //do rma related checks @@ -185,26 +175,13 @@ void RmaInterface::Attach( const DistMatrix& X ) template Int RmaInterface::NextIndex ( Int dataSize, - std::deque > &dataVectors, - std::deque &statuses ) + std::deque > &dataVectors) { DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) const Int numCreated = dataVectors.size (); - DEBUG_ONLY (if (numCreated != Int (statuses.size ())) - ("size mismatch");) - // check if we may reuse an existing index - for (Int i = 0; i < numCreated; ++i) - { - if (!statuses[i]) - { - statuses[i] = true; - dataVectors[i].resize (dataSize); - return i; - } - } + dataVectors.resize (numCreated + 1); 
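// [Editorial note] With the status tracking removed, this NextIndex is
// grow-only: every Put/Get/Acc appends a fresh staging vector, so peak
// staging memory scales with the number of updates issued per attach
// epoch. A hedged usage sketch of the resulting lifetimes (rmaInt is an
// illustrative RmaInterface instance, A a local Matrix):
//
//   rmaInt.Put( A, i, j );   // appends one staging buffer per target
//   rmaInt.Flush( A );       // transfers complete after this returns,
//                            // but the staging deques keep growing
//   rmaInt.Detach();         // only here is the memory actually freed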
dataVectors[numCreated].resize (dataSize); - statuses.push_back (true); return numCreated; } @@ -259,7 +236,7 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination], putStatus_[destination] ); + putVector_[destination]); T* sendBuffer = putVector_[destination][index].data(); T* XBuffer = Z.Buffer(); @@ -275,9 +252,6 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - // local flush, okay to clear buffers after this - //mpi::FlushLocal (destination, window); - //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -338,7 +312,7 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination], putStatus_[destination] ); + putVector_[destination]); T* sendBuffer = putVector_[destination][index].data(); const T* XBuffer = Z.LockedBuffer(); @@ -353,9 +327,6 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - // local flush, okay to clear buffers after this - //mpi::FlushLocal (destination, window); - //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -417,7 +388,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - getVector_[destination], getStatus_[destination]); + getVector_[destination]); T *getBuffer = getVector_[destination][index].data (); // get @@ -430,7 +401,6 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // no difference between localflush // and flush for Get mpi::FlushLocal (destination, window); - //getStatus_[destination][index] = false; // update local matrix for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination], putStatus_[destination] ); + putVector_[destination]); T* sendBuffer = putVector_[destination][index].data(); T* XBuffer = Z.Buffer(); @@ -515,15 +485,11 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - // local flush, okay to clear buffers after this - //mpi::FlushLocal (destination, window); - //putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - #ifdef EL_EXPLICIT_PROGRESS RmaProgress (g.VCComm ()); #endif @@ -580,7 +546,7 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination], putStatus_[destination]); + putVector_[destination]); T* sendBuffer = putVector_[destination][index].data(); const T* XBuffer = Z.LockedBuffer(); @@ -595,15 +561,11 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - // local flush, okay to clear buffers after this - //mpi::FlushLocal (destination, window); - 
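// [Editorial note] What removing the per-destination FlushLocal changes:
// MPI_Win_flush_local( rank, win ) guarantees only local completion (the
// origin buffer may be reused, but the data need not yet be visible at
// the target), while MPI_Win_flush( rank, win ) completes the operation
// at both ends. After this change, buffer ownership returns to the caller
// only at the later window flush. Sketch in plain MPI-3 RMA:
//
//   MPI_Accumulate( buf, n, MPI_DOUBLE, dest, disp, n, MPI_DOUBLE,
//                   MPI_SUM, win );
//   MPI_Win_flush_local( dest, win ); // buf reusable; target may lag
//   MPI_Win_flush( dest, win );       // complete at the target as well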
//putStatus_[destination][index] = false; } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - #ifdef EL_EXPLICIT_PROGRESS RmaProgress (g.VCComm ()); #endif @@ -620,19 +582,38 @@ void RmaInterface::Flush( Matrix& Z, Int i, Int j ) //do rma related checks const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; for( Int step=0; step::Flush( const Matrix& Z, Int i, Int j ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - DistMatrix& Y = *GlobalArrayPut_; + const DistMatrix& Y = *GlobalArrayGet_; //do rma related checks const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; for( Int step=0; step void RmaInterface::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); DistMatrix& Y = *GlobalArrayPut_; - + + //do rma related checks const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + // i = j = 0 - leftmost coordinates of DistMatrix + const Int colAlign = Y.ColAlign() % r; + const Int rowAlign = Y.RowAlign() % c; - // flush all - mpi::Flush ( window ); - - // clear statuses + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; for( Int step=0; step::Flush( const Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - DistMatrix& Y = *GlobalArrayPut_; + // rma checks, see if Z is not NULL, etc + const DistMatrix& Y = *GlobalArrayGet_; + + //do rma related checks const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + // i = j = 0 - leftmost coordinates of DistMatrix + const Int colAlign = Y.ColAlign() % r; + const Int rowAlign = Y.RowAlign() % c; - // flush all - mpi::Flush ( window ); - - // clear statuses + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + // find destination + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; for( Int step=0; step::Detach() putVector_.clear(); getVector_.clear(); - putStatus_.clear(); - getStatus_.clear(); - mpi::WindowUnlock (window); mpi::WindowFree (window); } From 40de29039e6b91c3a4eeaa771c68750e048428de Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 18 Aug 2014 18:58:35 -0500 Subject: [PATCH 078/110] adding local flush 
after put-acc --- src/core/RmaInterface.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index d20e212c2d..6ec866cee2 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -252,6 +252,7 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -327,6 +328,7 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -485,6 +487,7 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -561,6 +564,7 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) From 67624b723cedd5aa18dfdb70169111385069d18b Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 22 Aug 2014 16:46:04 -0500 Subject: [PATCH 079/110] remove flush local from put/acc --- src/core/RmaInterface.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 6ec866cee2..4c850789bd 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -252,7 +252,6 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -328,7 +327,6 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -487,7 +485,6 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -564,7 +561,6 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -676,7 +672,7 @@ void RmaInterface::Flush( Matrix& Z ) LogicError("Must initiate transfer before flushing."); DistMatrix& Y = *GlobalArrayPut_; - + //do rma related checks const Grid& g = Y.Grid(); const Int r = g.Height(); From e4e7218a4c55a076b5a6cf9f52ea4a5782600009 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 28 Aug 2014 10:50:59 -0500 Subject: [PATCH 080/110] add a barrier after nbc stuff in original axpyinterface; replace issend by isend in axpy2, and remove a one byte send for data request in get, instead we send coordinates and use data request tag instead of coord tag; add a note on progress in rmainterface --- src/core/AxpyInterface.cpp | 4 
+++- src/core/AxpyInterface2.0.cpp | 39 ++++++++--------------------------- src/core/RmaInterface.cpp | 2 ++ 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 6eb07b99d0..840a3d4d48 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -813,6 +813,7 @@ template < typename T > void AxpyInterface < T >::Detach () } } } + mpi::Barrier (g.VCComm ()); #else while (!Finished ()) { @@ -850,6 +851,7 @@ template < typename T > void AxpyInterface < T >::Detach () } } } + mpi::Barrier (g.VCComm ()); #else while (!Finished ()) { @@ -859,7 +861,7 @@ template < typename T > void AxpyInterface < T >::Detach () mpi::Barrier (g.VCComm ()); #endif } - + attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; recvVector_.clear(); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 86a4ed8558..d98269d8a5 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -8,7 +8,6 @@ which can be found in the LICENSE file in the root directory, or at // TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined // TODO bring back const interfaces -// TODO localflush namespace El { template @@ -367,7 +366,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } // put request - mpi::TaggedISSend (sendBuffer, numEntries, destination, + mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), matrices_[matrix_index].requests_[destination][dindex]); @@ -412,28 +411,14 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) const Int p = g.Size (); std::vector recvVector_; - - Int matrix_index, coord_index; + Int coord_index; T* XBuffer = Z.Buffer(); // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { - // we just use the request objects for progress - const Int dindex = - NextIndexMatrix (rank, - 1, - XBuffer, - &matrix_index); - DEBUG_ONLY (if (Int (matrices_[matrix_index].data_[rank][dindex].size ()) != - 1) LogicError ("Error in NextIndexMatrix");) - // send request - T *requestBuffer = matrices_[matrix_index].data_[rank][dindex].data(); - mpi::TaggedISend (requestBuffer, 1, rank, - REQUEST_GET_TAG, g.VCComm(), - matrices_[matrix_index].requests_[rank][dindex]); - - // send coordinates + // send coordinates, no need to send a separate + // request object const Int cindex = NextIndexCoord (i, j, rank, @@ -442,7 +427,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, rank, - COORD_IJ_TAG, g.VCComm (), + REQUEST_GET_TAG, g.VCComm (), coords_[coord_index].requests_[rank][cindex]); } @@ -562,7 +547,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } - mpi::TaggedISSend (sendBuffer, numEntries, destination, + mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), matrices_[matrix_index].requests_[destination][dindex]); @@ -877,17 +862,11 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { const Int source = status.MPI_SOURCE; - // dummy var for receiving request - // we don't use this anyway - T dummy_[1]; - // post receive request for get - mpi::TaggedRecv (dummy_, 1, source, - REQUEST_GET_TAG, g.VCComm()); - + // post receive for coordinates Int 
coord[2] = {-1, -1}; mpi::TaggedRecv (coord, 2, source, - COORD_IJ_TAG, g.VCComm()); + REQUEST_GET_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; const Int colAlign = (Y.ColAlign() + i) % r; @@ -931,7 +910,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) } // Fire off non-blocking send - mpi::TaggedISSend (replyBuffer, numEntries, source, + mpi::TaggedISend (replyBuffer, numEntries, source, DATA_GET_TAG, g.VCComm (), matrices_[matrix_index].requests_[source][index]); } diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 4c850789bd..718acd3867 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -257,6 +257,8 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } +// Note: This as of now does not progress +// RMA routines #ifdef EL_EXPLICIT_PROGRESS RmaProgress (g.VCComm ()); #endif From 6b4a6e2850a39445b967f6aaa1b5d075bb500216 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 2 Sep 2014 11:41:33 -0500 Subject: [PATCH 081/110] axpy2: adding put/get/acc functions which are blocking, the nb ones are Iget, Iput, Iacc etc which requires a flush to complete communication; rma: removing flush i,j,and modify implementation so that flush is nothing but a flush-all --- include/El/core/AxpyInterface2.0.hpp | 25 +- include/El/core/imports/mpi.hpp | 1 - src/core/AxpyInterface2.0.cpp | 443 +++++++++++++++++++++++++-- src/core/RmaInterface.cpp | 83 +---- 4 files changed, 442 insertions(+), 110 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index a71cb57f71..a06d30a7ac 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -21,6 +21,19 @@ class AxpyInterface2 void Attach( DistMatrix& Z ); void Attach( const DistMatrix& Z ); + // nonblocking update routines + void Iput( Matrix& Z, Int i, Int j ); + void Iput( const Matrix& Z, Int i, Int j ); + + void Iget( Matrix& Z, Int i, Int j ); + + void Iacc( Matrix& Z, Int i, Int j ); + void Iacc( const Matrix& Z, Int i, Int j ); + + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + + // blocking update routines void Put( Matrix& Z, Int i, Int j ); void Put( const Matrix& Z, Int i, Int j ); @@ -29,14 +42,6 @@ class AxpyInterface2 void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z, Int i, Int j ); - void Flush( const Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z ); - void Flush( const Matrix& Z ); - - void LocalFlush( Matrix& Z, Int i, Int j ); - void LocalFlush( const Matrix& Z, Int i, Int j ); - void Detach(); private: @@ -113,7 +118,9 @@ class AxpyInterface2 bool TestCoord ( Matrix& Z ); bool TestCoord ( const Matrix& Z ); - + + // these are only used for nonblocking + // update rountines void HandleGlobalToLocalData( Matrix& Z ); void HandleLocalToGlobalData( Matrix& Z, Int count, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 33f24aaf37..689444bbeb 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -355,7 +355,6 @@ bool Testany( int count, Request* requests ); bool Testany( int count, Request* requests, int& indx ); bool Testany( int count, Request* requests, int& indx, Status& status ); bool IProbe( int source, int tag, Comm comm, Status& status ); -void RequestFree( Request& request ); template int GetCount( Status& status 
); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index d98269d8a5..78a269153d 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -298,6 +298,7 @@ Int AxpyInterface2::NextIndexCoord ( return numCreated; } +// blocking update routines template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { @@ -334,6 +335,421 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) Int matrix_index, coord_index; + for( Int step=0; step getVector_; + mpi::Status status; + + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while (!DONE) + { + if ( ( mpi::IProbe(mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status ) ) ) + { + const Int source = status.MPI_SOURCE; + const Int count = mpi::GetCount (status); + getVector_.resize ( count ); + // post receive for data + T *getBuffer = getVector_.data(); + + mpi::TaggedRecv (getBuffer, count, source, + DATA_PUT_TAG, g.VCComm()); + + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] = XCol[s]; + } + } + // progress my sends + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + } + else + { + // check if all nb sends are complete + if ( TestMatrix( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } + } + + getVector_.clear(); +} + +template +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + DistMatrix& X = *GlobalArrayGet_; + + const Int height = Z.Height (); + const Int width = Z.Width (); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); + + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + + std::vector recvVector_; + Int matrix_index; + + const Int XLDim = Z.LDim(); + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + Int receivingRow = g.Row(); + Int receivingCol = g.Col(); + + for( Int step=0; step (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = X.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } + } + // progress my sends + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + 
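+          // (A sketch of the nonblocking-consensus termination pattern
+          // used by Put/Get/Acc in this patch: keep servicing incoming
+          // messages; once mpi::Test shows that all of this rank's
+          // nonblocking sends have completed, post mpi::IBarrier once
+          // and keep polling its request. When that test succeeds,
+          // every rank has entered the barrier, so no messages can
+          // still be in flight and the receive loop may terminate.)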
} + else + { + // check if all nb sends are complete + if ( TestMatrix( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } + } + + recvVector_.clear(); +} + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + Int matrix_index, coord_index; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim(); + + for( Int step=0; step getVector_; + mpi::Status status; + + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while (!DONE) + { + if ( ( mpi::IProbe(mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status ) ) ) + { + const Int source = status.MPI_SOURCE; + const Int count = mpi::GetCount (status); + getVector_.resize ( count ); + // post receive for data + T *getBuffer = getVector_.data(); + + mpi::TaggedRecv (getBuffer, count, source, + DATA_ACC_TAG, g.VCComm()); + + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += XCol[s]; + } + } + // progress my sends + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + } + else + { + // check if all nb sends are complete + if ( TestMatrix( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } + } + + getVector_.clear(); +} + +// nonblocking update routines +template +void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iput")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + Int matrix_index, coord_index; + for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) } template -void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +void 
AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached @@ -477,9 +893,9 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -570,12 +986,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) } } -// free requests - this is necessary for freeing -// requests of ISends after successful Recvs, -// because - -// progress communication for a particular matrix -// progress requests +// progress communications for a particular matrix template bool AxpyInterface2::TestMatrix ( Matrix& Z ) { @@ -657,10 +1068,8 @@ bool AxpyInterface2::TestCoord ( Matrix& Z ) } // flush ensures local and remote completion -// this interface assumes a send has been issued -// and will post a matching receive and progress template -void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) @@ -718,14 +1127,6 @@ void AxpyInterface2::Flush( Matrix& Z, Int i, Int j ) } } -// all communications pertaining to matrix Z -template -void AxpyInterface2::Flush( Matrix& Z ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - Flush ( Z, 0, 0 ); -} - template < typename T > void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int source ) { diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 718acd3867..de8dd2bd28 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -673,44 +673,7 @@ void RmaInterface::Flush( Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - DistMatrix& Y = *GlobalArrayPut_; - - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - // i = j = 0 - leftmost coordinates of DistMatrix - const Int colAlign = Y.ColAlign() % r; - const Int rowAlign = Y.RowAlign() % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step @@ -718,48 +681,10 @@ void RmaInterface::Flush( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); - // rma checks, see if Z is not NULL, etc - const DistMatrix& Y = *GlobalArrayGet_; - - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - // i = j = 0 - leftmost coordinates of 
DistMatrix - const Int colAlign = Y.ColAlign() % r; - const Int rowAlign = Y.RowAlign() % c; - - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step From 55e4d4b13c76af45bfc6e091439bbb1e55179548 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 10 Sep 2014 13:25:21 -0500 Subject: [PATCH 082/110] added axpy blocking interface, added waitall in mpi... blocking functions return incorrect results due to a synchronization issue, working on it --- include/El/core/AxpyInterface2.0.hpp | 7 +- include/El/core/imports/mpi.hpp | 4 +- src/core/AxpyInterface2.0.cpp | 1267 +++++++++++++------------- src/core/imports/mpi.cpp | 13 +- 4 files changed, 636 insertions(+), 655 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index a06d30a7ac..576b1e1cda 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -22,6 +22,7 @@ class AxpyInterface2 void Attach( const DistMatrix& Z ); // nonblocking update routines + // requires flush for completion void Iput( Matrix& Z, Int i, Int j ); void Iput( const Matrix& Z, Int i, Int j ); @@ -81,7 +82,7 @@ class AxpyInterface2 std::vector coords_; - // need to add const here... + // TODO need to add const here... DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; @@ -119,6 +120,10 @@ class AxpyInterface2 bool TestCoord ( Matrix& Z ); bool TestCoord ( const Matrix& Z ); + /* Wait */ + void WaitMatrix ( Matrix& Z ); + void WaitMatrix ( const Matrix& Z ); + // these are only used for nonblocking // update routines void HandleGlobalToLocalData( Matrix& Z ); diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 689444bbeb..af09ad71a8 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -343,10 +343,12 @@ void Barrier( Comm comm ); #if MPI_VERSION>=3 void IBarrier( Comm comm, Request& request ); #endif +void RequestFree( Request& request ); void Wait( Request& request ); void Wait( Request& request, Status& status ); //TODO add another function for getting statuses -void WaitAny( int count, Request *requests, int *indx ); +void WaitAny (int numRequests, Request * requests, Int * index); + void WaitAll( int numRequests, Request* requests ); void WaitAll( int numRequests, Request* requests, Status* statuses ); bool Test( Request& request ); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 78a269153d..de69f0d7c6 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -298,9 +298,8 @@ Int AxpyInterface2::NextIndexCoord ( return numCreated; } -// blocking update routines template -void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) @@ -349,7 +348,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; T* XBuffer = Z.Buffer(); const Int dindex = - NextIndexMatrix (destination, + NextIndexMatrix (destination, numEntries, XBuffer, &matrix_index); @@ -370,71 +369,27 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm (), matrices_[matrix_index].requests_[destination][dindex]); + + // send coordinates + const Int cindex = + NextIndexCoord (i, j, + destination, + XBuffer, +
&coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data ()); + coord[0] = i; coord[1] = j; + mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), + coords_[coord_index].requests_[destination][cindex]); } - - receivingRow = (receivingRow + 1) % r; + receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - - // my receives - // number of entries in my PE - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); - const Int localHeight = Length( height, Y.ColShift(), r ); - const Int localWidth = Length( width, Y.RowShift(), c ); - - std::vector getVector_; - mpi::Status status; - - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - - while (!DONE) - { - if ( ( mpi::IProbe(mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status ) ) ) - { - const Int source = status.MPI_SOURCE; - const Int count = mpi::GetCount (status); - getVector_.resize ( count ); - // post receive for data - T *getBuffer = getVector_.data(); - - mpi::TaggedRecv (getBuffer, count, source, - DATA_PUT_TAG, g.VCComm()); - - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] = XCol[s]; - } - } - // progress my sends - if ( nb_bar_active ) - { - DONE = mpi::Test ( nb_bar_request ); - } - else - { - // check if all nb sends are complete - if ( TestMatrix( Z ) ) - { - mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; - } - } - } - - getVector_.clear(); } template -void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) // a call to Attach with a non-const DistMatrix must set @@ -456,74 +411,32 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) const Int p = g.Size (); std::vector recvVector_; - Int matrix_index; - - const Int XLDim = Z.LDim(); - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); + Int coord_index; - Int receivingRow = g.Row(); - Int receivingCol = g.Col(); - - for( Int step=0; step(coords_[coord_index].coord_[rank][cindex].data ()); + coord[0] = i; coord[1] = j; + mpi::TaggedISend (coord, 2, rank, + REQUEST_GET_TAG, g.VCComm (), + coords_[coord_index].requests_[rank][cindex]); } // Receive all of the replies - mpi::Status status; - - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - - while ( !DONE ) + Int numReplies = 0; + while (numReplies < p) { + mpi::Status status; + HandleGlobalToLocalData ( Z ); if (mpi::IProbe (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { @@ -555,30 +468,16 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) for (Int s = 0; s < localHeight; ++s) YCol[colShift + s * r] = XCol[s]; } - } - // progress my sends - if ( nb_bar_active ) - { - DONE = mpi::Test ( nb_bar_request ); - } - else - { - // check if all nb sends are complete - if ( TestMatrix( Z ) ) - { - mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; - } + ++numReplies; + recvVector_.clear(); } } - - recvVector_.clear(); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x 
width template -void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) @@ -651,251 +550,369 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), matrices_[matrix_index].requests_[destination][dindex]); + + // send coordinates + const Int cindex = + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord[0] = i; coord[1] = j; + mpi::TaggedISend (coord, 2, destination, + COORD_IJ_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); } - - receivingRow = (receivingRow + 1) % r; + receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // my receives - // number of entries in my PE - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); - const Int localHeight = Length( height, Y.ColShift(), r ); - const Int localWidth = Length( width, Y.RowShift(), c ); +} - std::vector getVector_; - mpi::Status status; - - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; +// free requests - this is necessary for freeing +// requests of ISends after successful Recvs, +// because + +// progress communication for a particular matrix +// progress requests +template +bool AxpyInterface2::TestMatrix ( Matrix& Z ) +{ + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + Int matrixIndex; + const T *base_address = Z.LockedBuffer(); - while (!DONE) + // search for matrix base + for (Int m = 0; m < numMatrices; m++) { - if ( ( mpi::IProbe(mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status ) ) ) + if ( matrices_[m].base_ == base_address ) { - const Int source = status.MPI_SOURCE; - const Int count = mpi::GetCount (status); - getVector_.resize ( count ); - // post receive for data - T *getBuffer = getVector_.data(); - - mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm()); + matrixIndex = m; + break; + } + matrixIndex = m+1; + } - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + // matrix not found + if ( matrixIndex == numMatrices) + return true; - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += XCol[s]; - } - } - // progress my sends - if ( nb_bar_active ) - { - DONE = mpi::Test ( nb_bar_request ); - } - else + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + const Int numStatuses = matrices_[matrixIndex].requests_[rank].size (); + for (int i = 0; i < numStatuses; i++) { - // check if all nb sends are complete - if ( TestMatrix( Z ) ) - { - mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; - } + matrices_[matrixIndex].statuses_[rank][i] = !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + if ( matrices_[matrixIndex].statuses_[rank][i] ) + return false; } } - - getVector_.clear(); + return true; } -// nonblocking update routines template -void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) +bool AxpyInterface2::TestCoord ( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iput")) - - if( i < 0 || j < 0 ) - 
LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numCoords = coords_.size(); + Int coordIndex; + const T *base_address = Z.LockedBuffer(); - DistMatrix& Y = *GlobalArrayPut_; - //do boundary checks - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + // search for coord base + for (Int m = 0; m < numCoords; m++) + { + if ( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + coordIndex = m+1; + } - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; + // coord not found + if ( coordIndex == numCoords) + return true; - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + for (int rank = 0; rank < p; ++rank) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + const Int numStatuses = coords_[coordIndex].requests_[rank].size (); + for (int i = 0; i < numStatuses; i++) + { + coords_[coordIndex].statuses_[rank][i] = !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + if ( coords_[coordIndex].statuses_[rank][i] ) + return false; + } + } + return true; +} - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; +// flush ensures local and remote completion +// this interface assumes a send has been issued +// and will post a matching receive and progress +template +void AxpyInterface2::Flush( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); - const Int YLDim = Y.LDim (); + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + + mpi::Status status; - Int matrix_index, coord_index; + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; - for( Int step=0; step (status); + HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + const Int count = mpi::GetCount (status); + HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData ( Z ); + break; + } + } + } + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + } + else + { + // check if all sends (data or request) are + // complete for a particular matrix + if ( TestMatrix( Z ) && TestCoord( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } + } +} - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) +template < typename T > +void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int source ) +{ + DistMatrix &Y = *GlobalArrayPut_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + int height = Z.Height(); + int width = Z.Width(); + // data vector + std::vector getVector_; + getVector_.resize (count); - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - for( Int t=0; 
t(coords_[coord_index].coord_[destination][cindex].data ()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), - coords_[coord_index].requests_[destination][cindex]); - } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + // post receive for coordinates + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_IJ_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; + + // post receive for data + T *getBuffer = getVector_.data(); + mpi::TaggedRecv (getBuffer, count, source, + DATA_PUT_TAG, g.VCComm()); + + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift(), r); + const Int jLocalOffset = Length (j, Y.RowShift(), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] = XCol[s]; } + // Free the memory + getVector_.clear(); } -template -void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) +// replica of above function except this accumulates +template < typename T > +void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int source ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - DistMatrix& X = *GlobalArrayGet_; + DistMatrix &Y = *GlobalArrayPut_; + const Grid & g = Y.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const int height = Z.Height(); + const int width = Z.Width(); - const Int height = Z.Height (); - const Int width = Z.Width (); + // data buffer + std::vector getVector_; + getVector_.resize (count); + + DEBUG_ONLY (if (count < Int (sizeof (T))) + LogicError ("Count was too small");) + + DEBUG_ONLY (if (Int (getVector_.size ()) != count) + LogicError ("Not enough space allocated");) - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); + // post receive for coordinates + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_IJ_TAG, g.VCComm()); + Int i = coord[0]; Int j = coord[1]; + + // post receive for data + T *getBuffer = getVector_.data(); + mpi::TaggedRecv (getBuffer, count, source, + DATA_ACC_TAG, g.VCComm()); - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); + // Update Y + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); - std::vector recvVector_; - Int coord_index; + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + const Int iLocalOffset = Length (i, Y.ColShift(), r); + const Int 
jLocalOffset = Length (j, Y.RowShift(), c); - T* XBuffer = Z.Buffer(); - // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) + for (Int t = 0; t < localWidth; ++t) { - // send coordinates, no need to send a separate - // request object - const Int cindex = - NextIndexCoord (i, j, - rank, - XBuffer, - &coord_index); - Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, rank, - REQUEST_GET_TAG, g.VCComm (), - coords_[coord_index].requests_[rank][cindex]); + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += XCol[s]; } + // Free the memory + getVector_.clear(); +} - // Receive all of the replies - Int numReplies = 0; - while (numReplies < p) +// handle request for data, post a matching issend +template < typename T > +void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) +{ + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) + + if ( !toBeAttachedForGet_ ) + LogicError("Local matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + + Int matrix_index; + + mpi::Status status; + + if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { - mpi::Status status; - HandleGlobalToLocalData ( Z ); - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); + const Int source = status.MPI_SOURCE; - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + // post receive for coordinates + Int coord[2] = {-1, -1}; + mpi::TaggedRecv (coord, 2, source, + REQUEST_GET_TAG, g.VCComm()); + Int i = coord[0]; Int j = coord[1]; + + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = X.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - ++numReplies; - recvVector_.clear(); + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + const Int numEntries = localHeight * localWidth; + + DEBUG_ONLY (if (numEntries < Int (sizeof (T))) + LogicError ("Count was too small");) + + T* XBuffer = Z.Buffer(); 
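+    // Note: the reply below travels via a nonblocking send, so the
+    // reply buffer cannot be a stack temporary; NextIndexMatrix hands
+    // out a per-destination slot inside matrices_ whose storage stays
+    // alive until TestMatrix/WaitMatrix observes the matching request
+    // as complete.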
+ const Int index = + NextIndexMatrix (source, + numEntries, + XBuffer, + &matrix_index); + + DEBUG_ONLY (if + (Int (matrices_[matrix_index].data_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndexMatrix");) + + T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); } + + // Fire off non-blocking send + mpi::TaggedISend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + matrices_[matrix_index].requests_[source][index]); } } -// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, -// where X is height x width +// blocking update routines template -void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -903,7 +920,6 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; - //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) LogicError("Submatrix out of bounds of global matrix"); @@ -916,10 +932,11 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) const Int myProcessCol = g.Col(); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - - Int matrix_index, coord_index; - + + Int matrix_index; + const Int XLDim = Z.LDim(); + // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); @@ -931,390 +948,340 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) for( Int step=0; step(coords_[coord_index].coord_[destination][cindex].data()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, - COORD_IJ_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); + DATA_PUT_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex]); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; - } -} -// progress communications for a particular matrix -template -bool AxpyInterface2::TestMatrix ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numMatrices = matrices_.size(); - Int matrixIndex; - const T *base_address = Z.LockedBuffer(); + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } - // search for matrix base - for (Int m = 0; m < numMatrices; m++) + // my receives + while ( !TestMatrix (Z) ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; - } + mpi::Status status; + std::vector getVector; - // matrix not found - if ( matrixIndex == numMatrices) - return true; + if ( mpi::IProbe( mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + const Int count = mpi::GetCount (status); - for (int rank = 0; rank < p; ++rank) - { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - const Int numStatuses = matrices_[matrixIndex].requests_[rank].size (); - for (int i = 0; i < numStatuses; i++) - { - matrices_[matrixIndex].statuses_[rank][i] = !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); - 
if ( matrices_[matrixIndex].statuses_[rank][i] ) - return false; - } - } - return true; -} + getVector.resize (count); + T *getBuffer = getVector.data (); -template -bool AxpyInterface2::TestCoord ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numCoords = coords_.size(); - Int coordIndex; - const T *base_address = Z.LockedBuffer(); + mpi::TaggedRecv (getBuffer, count, source, + DATA_PUT_TAG, g.VCComm()); - // search for coord base - for (Int m = 0; m < numCoords; m++) - { - if ( coords_[m].base_ == base_address ) - { - coordIndex = m; - break; - } - coordIndex = m+1; - } + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - // coord not found - if ( coordIndex == numCoords) - return true; + const Int colShift = Shift (g.Row(), colAlign, r); + const Int rowShift = Shift (g.Col(), rowAlign, c); - for (int rank = 0; rank < p; ++rank) - { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - const Int numStatuses = coords_[coordIndex].requests_[rank].size (); - for (int i = 0; i < numStatuses; i++) - { - coords_[coordIndex].statuses_[rank][i] = !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); - if ( coords_[coordIndex].statuses_[rank][i] ) - return false; + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] = XCol[s]; + } + + getVector.clear(); } } - return true; } -// flush ensures local and remote completion template -void AxpyInterface2::Flush( Matrix& Z ) +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + DistMatrix& X = *GlobalArrayGet_; - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); + const Int height = Z.Height (); + const Int width = Z.Width (); - mpi::Status status; - - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid AxpyGlobalToLocal submatrix"); - while ( !DONE ) + const Grid & g = X.Grid (); + const Int r = g.Height (); + const Int c = g.Width (); + const Int p = g.Size (); + + Int matrix_index; + + const Int XLDim = Z.LDim(); + const Int colAlign = (X.ColAlign() + i) % r; + const Int rowAlign = (X.RowAlign() + j) % c; + const Int iLocalOffset = Length (i, X.ColShift (), r); + const Int jLocalOffset = Length (j, X.RowShift (), c); + + Int receivingRow = g.Row(); + Int receivingCol = g.Col(); + + for( Int step=0; step (status); - HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); - break; - } - case DATA_ACC_TAG: - { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); - break; - } - case REQUEST_GET_TAG: - { - HandleGlobalToLocalData ( Z ); - break; - } 
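+            // Pack this rank's local patch into the persistent reply
+            // buffer obtained from NextIndexMatrix; a local temporary
+            // would not survive the nonblocking send fired off below.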
+ T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); } + + // Fire off non-blocking send + mpi::TaggedISend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + matrices_[matrix_index].requests_[source][index]); } - if ( nb_bar_active ) - { - DONE = mpi::Test ( nb_bar_request ); - } - else + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } + + // Receive all of the replies + while ( !TestMatrix (Z) ) + { + mpi::Status status; + std::vector recvVector_; + + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { - // check if all sends (data or request) are - // complete for a particular matrix - if ( TestMatrix( Z ) && TestCoord( Z ) ) + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) { - mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; + T *YCol = X.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; } } + recvVector_.clear(); } } -template < typename T > -void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int source ) +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width + template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { - DistMatrix &Y = *GlobalArrayPut_; - const Grid & g = Y.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - int height = Z.Height(); - int width = Z.Width(); - // data vector - std::vector getVector_; - getVector_.resize (count); + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) - DEBUG_ONLY (if (count < Int (sizeof (T))) - LogicError ("Count was too small");) - DEBUG_ONLY (if (Int (getVector_.size ()) != count) - LogicError ("Not enough space allocated");) + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); - // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, - COORD_IJ_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - - // post receive for data - T *getBuffer = getVector_.data(); - mpi::TaggedRecv (getBuffer, count, source, - DATA_PUT_TAG, g.VCComm()); - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); + DistMatrix& Y = *GlobalArrayPut_; - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length 
(width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] = XCol[s]; - } - // Free the memory - getVector_.clear(); -} + const Grid& g = Y.Grid(); -// replica of above function except this accumulates -template < typename T > -void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int source ) -{ - DistMatrix &Y = *GlobalArrayPut_; - const Grid & g = Y.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const int height = Z.Height(); - const int width = Z.Width(); + const Int XLDim = Z.LDim(); - // data buffer - std::vector getVector_; - getVector_.resize (count); + const Int height = Z.Height(); + const Int width = Z.Width(); - DEBUG_ONLY (if (count < Int (sizeof (T))) - LogicError ("Count was too small");) + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); - DEBUG_ONLY (if (Int (getVector_.size ()) != count) - LogicError ("Not enough space allocated");) - - // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, - COORD_IJ_TAG, g.VCComm()); - Int i = coord[0]; Int j = coord[1]; - - // post receive for data - T *getBuffer = getVector_.data(); - mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm()); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); + + const Int colShift = Shift (myProcessRow, colAlign, r); + const Int rowShift = Shift (myProcessCol, rowAlign, c); const Int localHeight = Length (height, colShift, r); const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += XCol[s]; - } - // Free the memory - getVector_.clear(); -} - -// handle request for data, post a matching issend -template < typename T > -void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) -{ - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) - - if ( !toBeAttachedForGet_ ) - LogicError("Local matrix cannot be updated"); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myRow = g.Row(); - const Int myCol = g.Col(); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); Int matrix_index; - mpi::Status status; + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; - if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) + const Int YLDim = Y.LDim(); + + for( Int step=0; step getVector; - T* XBuffer = Z.Buffer(); - const Int index = - 
NextIndexMatrix (source, - numEntries, - XBuffer, - &matrix_index); + // my receives + while ( 1 ) + { + if ( mpi::IProbe( mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + const Int count = mpi::GetCount (status); - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) - - T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &replyBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); + getVector.resize (count); + T *getBuffer = getVector.data (); + + mpi::TaggedRecv (getBuffer, count, source, + DATA_ACC_TAG, g.VCComm()); + + const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[s] += XCol[s]; + } } - // Fire off non-blocking send - mpi::TaggedISend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), - matrices_[matrix_index].requests_[source][index]); + // test if my sends are completed + if ( TestMatrix ( Z ) ) + break; } + + getVector.clear(); } // detach collectively diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 5a4b98c0bf..593cd3ca85 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -1425,9 +1425,8 @@ bool Testany (int count, Request * requests) // Ensure that the request finishes before continuing void Wait (Request & request) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) Status - status; - SafeMpi (MPI_Wait (&request, &status)); + DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) + SafeMpi (MPI_Wait (&request, MPI_STATUS_IGNORE)); } // Ensure that the request finishes before continuing @@ -1455,6 +1454,14 @@ void WaitAll (int numRequests, Request * requests, (numRequests, requests, statuses)); } +// Ensure that at least one of the requests finishes before continuing +void WaitAny (int numRequests, Request * requests, Int * index) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WaitAny")) + SafeMpi (MPI_Waitany + (numRequests, requests, index, MPI_STATUS_IGNORE)); +} + // Nonblocking test for message completion bool IProbe (int source, int tag, Comm comm, Status & status) From af9f3ca13c83a68df709b4a81b52e4a2447fe493 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 17 Sep 2014 02:28:52 -0500 Subject: [PATCH 083/110] added some new functions in mpi; still fixing blocking interfaces... Get is completely broken --- include/El/core/AxpyInterface2.0.hpp | 3 +- include/El/core/imports/mpi.hpp | 14 ++ src/core/AxpyInterface2.0.cpp | 239 ++++++++++++++++++++------- src/core/imports/mpi.cpp | 79 ++++++++- 4 files changed, 269 insertions(+), 66 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 576b1e1cda..97526390f5 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -52,7 +52,8 @@ class AxpyInterface2 DATA_GET_TAG =2, DATA_ACC_TAG =3, REQUEST_GET_TAG =4, - COORD_IJ_TAG =5; + COORD_ACC_TAG =5, + COORD_PUT_TAG =6; // struct for passing data struct matrix_params_ diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index af09ad71a8..cdf2ccbc2b 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -102,6 +102,7 @@ typedef
MPI_Datatype Datatype; typedef MPI_Errhandler ErrorHandler; typedef MPI_Request Request; typedef MPI_Status Status; +typedef MPI_Message Message; typedef MPI_User_function UserFunction; #if MPI_VERSION >= 3 typedef MPI_Win Window; @@ -357,6 +358,14 @@ bool Testany( int count, Request* requests ); bool Testany( int count, Request* requests, int& indx ); bool Testany( int count, Request* requests, int& indx, Status& status ); bool IProbe( int source, int tag, Comm comm, Status& status ); +bool IProbe( int source, Comm comm, Status& status ); +bool IProbe( Comm comm, Status& status ); +void Probe ( int source, int tag, Comm comm, Status & status ); +void Probe ( int source, Comm comm, Status & status ); +void Probe ( Comm comm, Status & status ); +// matching probe +bool IMprobe( int source, int tag, Comm comm, Status& status, Message& message ); + template int GetCount( Status& status ); @@ -432,6 +441,11 @@ T TaggedRecv( int from, int tag, Comm comm ); // If the recv count is one and the tag is irrelevant template T Recv( int from, Comm comm ); +// matched recv +template +void TaggedMrecv( R* buf, int count, Message & message ); +template +void TaggedMrecv( Complex* buf, int count, Message & message ); // Non-blocking recv // ----------------- diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index de69f0d7c6..2abc46d1ec 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -301,7 +301,7 @@ Int AxpyInterface2::NextIndexCoord ( template void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iput")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -379,7 +379,7 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data ()); coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, COORD_IJ_TAG, g.VCComm (), + mpi::TaggedISend (coord, 2, destination, COORD_PUT_TAG, g.VCComm (), coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -391,7 +391,7 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) template void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached @@ -479,7 +479,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) template void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -561,7 +561,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); coord[0] = i; coord[1] = j; mpi::TaggedISend (coord, 2, destination, - COORD_IJ_TAG, g.VCComm(), + COORD_ACC_TAG, g.VCComm(), coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -570,9 +570,43 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } -// free requests - this is necessary for freeing -// requests of ISends after successful Recvs, -// because +// wait +template +void AxpyInterface2::WaitMatrix ( Matrix& Z ) +{ + 
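+    // Intended semantics (a sketch): block via mpi::Wait on every
+    // outstanding request recorded for this matrix's send buffers,
+    // then clear the per-rank status flags so the request/buffer
+    // slots can be reused by later NextIndexMatrix calls.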
DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + Int matrixIndex; + const T *base_address = Z.LockedBuffer(); + + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } + // matrix not found + if ( matrixIndex == numMatrices) + return; + + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + const Int numRequests = matrices_[matrixIndex].requests_[rank].size (); + for (int i = 0; i < numRequests; i++) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = false; + } + } +} // progress communication for a particular matrix // progress requests @@ -670,11 +704,10 @@ void AxpyInterface2::Flush( Matrix& Z ) const Grid& g = Y.Grid(); mpi::Status status; - bool DONE = false; mpi::Request nb_bar_request; bool nb_bar_active = false; - + while ( !DONE ) { // similar to HandleXYZ functions in original AxpyInterface @@ -701,6 +734,7 @@ void AxpyInterface2::Flush( Matrix& Z ) } } } + if ( nb_bar_active ) { DONE = mpi::Test ( nb_bar_request ); @@ -741,7 +775,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s // post receive for coordinates Int coord[2]; mpi::TaggedRecv (coord, 2, source, - COORD_IJ_TAG, g.VCComm()); + COORD_PUT_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; @@ -799,7 +833,7 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so // post receive for coordinates Int coord[2]; mpi::TaggedRecv (coord, 2, source, - COORD_IJ_TAG, g.VCComm()); + COORD_ACC_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; // post receive for data @@ -915,48 +949,54 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); + + const Int XLDim = Z.LDim(); + + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - Int matrix_index; - - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + Int matrix_index, coord_index; Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; + + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != numEntries) LogicError ("Error in NextIndexMatrix");) - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + for( Int t=0; t::Put( 
Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm(), matrices_[matrix_index].requests_[destination][dindex]); + + // send coordinates + const Int cindex = + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord[0] = i; coord[1] = j; + mpi::TaggedISend (coord, 2, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -988,25 +1042,43 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) receivingCol = (receivingCol + 1) % c; } + // progress my sends + TestMatrix ( Z ); + TestCoord ( Z ); + // my receives - while ( !TestMatrix (Z) ) + std::vector getVector; + bool flag = true; + + while ( flag ) { mpi::Status status; - std::vector getVector; - - if ( mpi::IProbe( mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status ) ) - { + flag = mpi::IProbe( mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status); + + if ( flag ) + { const Int source = status.MPI_SOURCE; const Int count = mpi::GetCount (status); + // post receive for coordinates + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_PUT_TAG, g.VCComm()); + Int i = coord[0]; Int j = coord[1]; + + // post recv for data getVector.resize (count); T *getBuffer = getVector.data (); mpi::TaggedRecv (getBuffer, count, source, - DATA_PUT_TAG, g.VCComm()); + DATA_PUT_TAG, g.VCComm ()); + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (g.Row(), colAlign, r); const Int rowShift = Shift (g.Col(), rowAlign, c); @@ -1023,10 +1095,13 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) for (Int s = 0; s < localHeight; ++s) YCol[s] = XCol[s]; } - - getVector.clear(); } } + + // wait for my sends + WaitMatrix ( Z ); + + getVector.clear(); } template @@ -1110,12 +1185,14 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) receivingCol = (receivingCol + 1) % c; } + // progress my sends + TestMatrix (Z); + + std::vector recvVector_; // Receive all of the replies - while ( !TestMatrix (Z) ) + while ( 1 ) { mpi::Status status; - std::vector recvVector_; - if (mpi::IProbe (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { @@ -1157,13 +1234,13 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width - template +template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) LogicError("Global matrix cannot be updated"); @@ -1187,22 +1264,13 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int colShift = Shift (myProcessRow, colAlign, r); - const Int rowShift = Shift (myProcessCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - Int matrix_index; + Int matrix_index, 
coord_index; Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; + + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); @@ -1210,13 +1278,16 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { const Int colShift = Shift( receivingRow, colAlign, r ); const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; + T* XBuffer = Z.Buffer(); // send data const Int dindex = @@ -1230,6 +1301,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) numEntries) LogicError ("Error in NextIndexMatrix");) T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), matrices_[matrix_index].requests_[destination][dindex]); + + // send coordinates + const Int cindex = + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord[0] = i; coord[1] = j; + mpi::TaggedISend (coord, 2, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -1248,25 +1333,52 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) receivingCol = (receivingCol + 1) % c; } - mpi::Status status; - std::vector getVector; + // progress my sends + TestMatrix ( Z ); + TestCoord ( Z ); // my receives - while ( 1 ) + std::vector getVector; + bool flag = true; + + while ( flag ) { - if ( mpi::IProbe( mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status ) ) - { + mpi::Status status; + flag = mpi::IProbe( mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status); + + if ( flag ) + { const Int source = status.MPI_SOURCE; const Int count = mpi::GetCount (status); + // post receive for coordinates + Int coord[2]; + mpi::TaggedRecv (coord, 2, source, + COORD_ACC_TAG, g.VCComm()); + Int i = coord[0]; Int j = coord[1]; + + // post recv for data getVector.resize (count); T *getBuffer = getVector.data (); - + mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm()); + DATA_ACC_TAG, g.VCComm ()); + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + + const Int colShift = Shift (g.Row(), colAlign, r); + const Int rowShift = Shift (g.Col(), rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + for (Int t = 0; t < localWidth; ++t) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); @@ -1275,11 +1387,10 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) YCol[s] += XCol[s]; } } - - // test if my sends are completed - if ( TestMatrix ( Z ) ) - break; } + + // wait for my sends + WaitMatrix ( Z ); getVector.clear(); } diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 593cd3ca85..74863373c9 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -1472,9 +1472,41 @@ bool IProbe (int source, int tag, Comm comm, (source, tag, comm.comm, &flag, &status)); 
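// The matched-probe wrappers added below (IMprobe/TaggedMrecv) pair
// MPI_Improbe with MPI_Mrecv: a successful Improbe dequeues the message
// from the matching queue and hands back a Message handle, so the
// eventual receive cannot be raced by another ANY_SOURCE receive posted
// between the probe and the recv. A minimal usage sketch of these
// wrappers (the tag and element type here are only examples):
//
//   mpi::Status status;
//   mpi::Message msg;
//   if (mpi::IMprobe (mpi::ANY_SOURCE, tag, comm, status, msg))
//   {
//       const int count = mpi::GetCount<double> (status);
//       std::vector<double> buf (count);
//       mpi::TaggedMrecv (buf.data (), count, msg); // consumes msg
//   }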
return flag; } + bool IProbe (int source, Comm comm, Status & status) { - return IProbe (source, 0, comm, status); + return IProbe (source, mpi::ANY_TAG, comm, status); +} + +bool IProbe (Comm comm, Status & status) +{ + return IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, comm, status); +} + +void Probe (int source, int tag, Comm comm, Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Probe")) + SafeMpi (MPI_Probe(source, tag, comm.comm, &status)); +} + +void Probe (int source, Comm comm, Status & status) +{ + Probe (source, mpi::ANY_TAG, comm, status); +} + +void Probe (Comm comm, Status & status) +{ + Probe (mpi::ANY_SOURCE, mpi::ANY_TAG, comm, status); +} + +bool IMprobe (int source, int tag, Comm comm, + Status & status, Message & message) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::IMprobe")) + int flag; + SafeMpi (MPI_Improbe + (source, tag, comm.comm, &flag, &message, &status)); + return flag; } template < typename T > int GetCount (Status & status) @@ -2032,6 +2064,51 @@ template void TaggedRecv (Complex < double >*buf, int count, int from, int tag, Comm comm); +// matching recv +template < typename R > +void TaggedMrecv (R * buf, int count, Message & msg) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Mrecv")) + Status status; + SafeMpi (MPI_Mrecv + (buf, count, TypeMap < R > (), + &msg, &status)); +} + +template < typename R > +void TaggedMrecv (Complex < R > *buf, int count, Message & msg) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Mrecv")) + Status status; +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi + (MPI_Mrecv + (buf, 2 * count, TypeMap < R > (), &msg, &status)); +#else + SafeMpi + (MPI_Mrecv + (buf, count, TypeMap < Complex < R >> (), + &msg, &status)); +#endif +} + +template void TaggedMrecv (byte * buf, int count, Message & msg); +template void TaggedMrecv (int *buf, int count, Message & msg); +template void TaggedMrecv (unsigned *buf, int count, Message & msg); +template void TaggedMrecv (long int *buf, int count, Message & msg); +template void TaggedMrecv (unsigned long *buf, int count, Message & msg); +#ifdef EL_HAVE_MPI_LONG_LONG +template void TaggedMrecv (long long int *buf, int count, Message & msg); +template void TaggedMrecv (unsigned long long *buf, + int count, Message & msg); +#endif +template void TaggedMrecv (float *buf, int count, Message & msg); +template void TaggedMrecv (double *buf, int count, Message & msg); +template void TaggedMrecv (Complex < float >*buf, + int count, Message & msg); +template void TaggedMrecv (Complex < double >*buf, + int count, Message & msg); + template < typename T > void Recv (T * buf, int count, int from, Comm comm) { From a8a70a80a1350aa74b9ef98410852a5d805ddc59 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 17 Sep 2014 02:31:27 -0500 Subject: [PATCH 084/110] three small test cases that tests axpy2 blocking and nb, and rma --- tests/Axpy2.cpp | 188 +++++++++++++++++++++++++++++++++++++++++++++++ tests/Axpy2b.cpp | 183 +++++++++++++++++++++++++++++++++++++++++++++ tests/Rma.cpp | 187 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 558 insertions(+) create mode 100644 tests/Axpy2.cpp create mode 100644 tests/Axpy2b.cpp create mode 100644 tests/Rma.cpp diff --git a/tests/Axpy2.cpp b/tests/Axpy2.cpp new file mode 100644 index 0000000000..ee42728cd7 --- /dev/null +++ b/tests/Axpy2.cpp @@ -0,0 +1,188 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. 
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + AxpyInterface2 < double > Axpy2int; + Axpy2int.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Axpy2int.Iacc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Axpy2int.Flush ( B ); + + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Axpy2int.Iget (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + Axpy2int.Flush ( C ); + // Collectively detach in order to finish filling process 0's request + Axpy2int.Detach (); + +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( 
comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/Axpy2b.cpp b/tests/Axpy2b.cpp new file mode 100644 index 0000000000..0ee0ebcaa8 --- /dev/null +++ b/tests/Axpy2b.cpp @@ -0,0 +1,183 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then a Barrier, + * then another epoch where all the ranks + * perform Get (on their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + AxpyInterface2 < double > Axpy2int; + Axpy2int.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + //Axpy2int.Put (B, i, j); + Axpy2int.Acc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) 
+ ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + //Axpy2int.Get (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Collectively detach in order to finish filling process 0's request + Axpy2int.Detach (); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of C"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/Rma.cpp b/tests/Rma.cpp new file mode 100644 index 0000000000..26f8647056 --- /dev/null +++ b/tests/Rma.cpp @@ -0,0 +1,187 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. 
+ * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Acc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Rmaint.Flush ( B ); + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Get (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Get doesn't require flush though + //Rmaint.Flush ( C ); + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + 
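  // All three tests distribute patches dynamically through the shared
  // counter allocated with MPI_Win_allocate above: ReadInc atomically
  // fetch-and-adds it, and each (i,j) patch is claimed by exactly one
  // rank (the "counter == next" test means this rank won that ticket).
  // The idiom in isolation, sketched with a hypothetical DoWork ():
  //
  //   long next = 0;
  //   long ticket = ReadInc (win, 0, (long) 1); // returns the old value
  //   for (int w = 0; w < numWorkUnits; ++w)
  //   {
  //       if (ticket == next) // this rank claimed work unit w
  //       {
  //           DoWork (w);
  //           ticket = ReadInc (win, 0, (long) 1);
  //       }
  //       next++;
  //   }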
mpi::Finalize (); + return 0; +} From 63f2f224e3a812fbc8f8ea00c57a2771d61a684e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 17 Sep 2014 02:32:59 -0500 Subject: [PATCH 085/110] adding a sample makefile for tests, will remove it in future though --- tests/Makefile.sample | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tests/Makefile.sample diff --git a/tests/Makefile.sample b/tests/Makefile.sample new file mode 100644 index 0000000000..4a7920a568 --- /dev/null +++ b/tests/Makefile.sample @@ -0,0 +1,28 @@ +VG_PATH = /home/sg/builds/valgrind +EL_PATH = /home/sg/builds/Elemental-updated +MPILOC = /home/sg/builds/mpich +CPPFLAGS = -g -O3 -pthread -DDEBUG=2 -std=c++11 -Wall -Wno-unused-variable +SRCS = $(wildcard *.cpp) +INCLUDES = -I$(MPILOC)/include -I$(EL_PATH)/include +LINK = -Wl,-rpath=$(EL_PATH)/lib -L$(EL_PATH)/lib -Wl,-rpath=$(MPILOC)/lib -L$(MPILOC)/lib +LIBS = -llapack -lblas -lrt -lm -lmpich -lopa -lmpl -lEl -lpmrrr +NAME = rmaaxpy +NPROCS = 4 + +all: $(NAME) + +$(NAME) : $(SRCS) + $(CXX) $(CPPFLAGS) $(INCLUDES) -o $(NAME) $? $(LINK) $(LIBS) +clean: + rm -f $(NAME) + +run: + $(MPILOC)/bin/mpiexec -n $(NPROCS) ./$(NAME) + +distclean: clean +profclean: + rm -f $(NAME).hpcstruct + rm -rf hpctoolkit-$(NAME)-measurements/ + rm -rf hpctoolkit-$(NAME)-database/ + rm -rf hpctoolkit-$(NAME)-database-*/ + rm -rf workspace/ From 1bfa059da130c66ef0593f24fc7d3b18a78f2896 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 8 Oct 2014 00:40:29 -0500 Subject: [PATCH 086/110] modified most of axpy2.0, got to fix get tomorrow --- include/El/core/AxpyInterface2.0.hpp | 91 +- include/El/core/imports/mpi.hpp | 6 + src/core/AxpyInterface.cpp | 1 + src/core/AxpyInterface2.0.cpp | 1196 +++++++++++++------------- src/core/imports/mpi.cpp | 67 ++ 5 files changed, 719 insertions(+), 642 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 97526390f5..6d5a9950e5 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -55,75 +55,42 @@ class AxpyInterface2 COORD_ACC_TAG =5, COORD_PUT_TAG =6; - // struct for passing data - struct matrix_params_ - { - T *base_; - std::vector>> - data_; - std::vector> - requests_; - std::vector> - statuses_; - }; - - std::vector matrices_; - - // struct for passing coordinates - struct coord_params_ - { - T *base_; - std::vector>> - coord_; - std::vector> - requests_; - std::vector> - statuses_; - }; - - std::vector coords_; - + // request statuses + std::vector> + sendDataStatuses_, sendCoordStatuses_, + recvDataStatuses_, recvCoordStatuses_; + + // request handles + std::vector> + sendDataRequests_, sendCoordRequests_, + recvDataRequests_, recvCoordRequests_; + + // data + std::vector>> + sendData_, recvData_; + + // coords + std::vector>> + sendCoord_, recvCoord_; + // TODO need to add const here... 
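    // Design note: the deques above form per-rank pools of
    // (buffer, request, in-flight flag) triples. NextIndexData and
    // NextIndexCoord scan a pool for a slot whose nonblocking operation
    // has completed and recycle it, growing the pool only when every
    // slot is still busy. The reuse rule, sketched with placeholder
    // names (the real code re-marks the slot busy and resizes its buffer):
    //
    //   for (Int s = 0; s < numSlots; ++s)
    //   {
    //       if (inFlight[s])
    //           inFlight[s] = !mpi::Test (requests[s]); // finished yet?
    //       if (!inFlight[s])
    //           return s; // recycle slot s
    //   }
    //   // all slots busy: append a fresh slot and use that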
DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - - Int NextIndexMatrix ( - Int target, - Int dataSize, - T * base_address, - Int *matrix_index); - - Int NextIndexMatrix ( - Int target, - Int dataSize, - const T * base_address, - Int *matrix_index); - - Int NextIndexCoord ( - Int i, Int j, - Int target, - T * base_address, - Int *matrix_index); - - Int NextIndexCoord ( - Int i, Int j, - Int target, - const T * base_address, - Int *matrix_index); - - /* Test */ - bool TestMatrix ( Matrix& Z ); - bool TestMatrix ( const Matrix& Z ); - - bool TestCoord ( Matrix& Z ); - bool TestCoord ( const Matrix& Z ); - /* Wait */ - void WaitMatrix ( Matrix& Z ); - void WaitMatrix ( const Matrix& Z ); + // next index for data and coord + Int NextIndexData + ( Int dataSize, + std::deque>& data, + std::deque& requests, + std::deque& requestStatuses ); + + Int NextIndexCoord + ( std::deque>& coord, + std::deque& requests, + std::deque& requestStatuses ); // these are only used for nonblocking // update rountines diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index cdf2ccbc2b..c45b86d061 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -243,6 +243,8 @@ void SetWindowProp ( Window& window, int prop ); void CheckBounds ( Window & window, mpi::Datatype win_type, mpi::Datatype type, size_t count, ptrdiff_t target_offset ); void RmaProgress ( Comm comm ); +long ReadInc (Window & win, Aint offset, + long inc, int fop_root); // strided/vector to datatype void StridedDatatype (El_strided_t* stride_descr, @@ -446,6 +448,10 @@ template void TaggedMrecv( R* buf, int count, Message & message ); template void TaggedMrecv( Complex* buf, int count, Message & message ); +template +void TaggedRecvS( R* buf, int count, int from, int tag, Comm comm, Status & status ); +template +void TaggedRecvS( Complex* buf, int count, int from, int tag, Comm comm, Status & status ); // Non-blocking recv // ----------------- diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 840a3d4d48..3ef28f35d3 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -541,6 +541,7 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) ReadyForSend (bufferSize, dataVectors_[destination], dataSendRequests_[destination], sendingData_[destination]); + DEBUG_ONLY (if (Int (dataVectors_[destination][index].size ()) != bufferSize) LogicError ("Error in ReadyForSend");) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 2abc46d1ec..41cee1f5bd 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -6,17 +6,23 @@ which can be found in the LICENSE file in the root directory, or at #include "El-lite.hpp" #include +#if MPI_VERSION>=3 // TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined // TODO bring back const interfaces namespace El { -template -AxpyInterface2::AxpyInterface2() - : GlobalArrayPut_(0), GlobalArrayGet_(0), - matrices_(0), coords_(0), - toBeAttachedForGet_(false), toBeAttachedForPut_(false), - attached_(false), detached_(false) -{ } + template + AxpyInterface2::AxpyInterface2() + : GlobalArrayPut_(0), GlobalArrayGet_(0), + sendDataStatuses_(0), sendCoordStatuses_(0), + recvDataStatuses_(0), recvCoordStatuses_(0), + sendDataRequests_(0), sendCoordRequests_(0), + recvDataRequests_(0), recvCoordRequests_(0), + sendData_(0), recvData_(0), + sendCoord_(0), recvCoord_(0), + 
toBeAttachedForGet_(false), toBeAttachedForPut_(false), + attached_(false), detached_(false) + { } template AxpyInterface2::AxpyInterface2( DistMatrix& Z ) @@ -31,30 +37,24 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayGet_ = &Z; const Grid& g = Z.Grid(); - const Int p = g.Size (); + const Int p = g.Size (); - if ( matrices_.empty() ) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if ( coords_.empty() ) + if ( sendData_.empty() ) { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); + sendDataStatuses_.resize (p); + sendCoordStatuses_.resize (p); + recvDataStatuses_.resize (p); + recvCoordStatuses_.resize (p); + + sendDataRequests_.resize (p); + sendCoordRequests_.resize (p); + recvDataRequests_.resize (p); + recvCoordRequests_.resize (p); + + sendData_.resize (p); + sendCoord_.resize (p); + recvData_.resize (p); + recvCoord_.resize (p); } } @@ -79,17 +79,17 @@ AxpyInterface2::~AxpyInterface2() } } -template + template void AxpyInterface2::Attach( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) - // attached_ will be only set in Attach - // and only unset in Detach - if (!attached_) - attached_ = true; - else - LogicError("Must detach before reattaching."); - + // attached_ will be only set in Attach + // and only unset in Detach + if (!attached_) + attached_ = true; + else + LogicError("Must detach before reattaching."); + // the matrix base_ is not known until // an update operation (put/get/acc) // so it is kept blank @@ -106,197 +106,100 @@ void AxpyInterface2::Attach( DistMatrix& Z ) const Grid& g = Z.Grid(); const Int p = g.Size (); - if ( matrices_.empty() ) + if ( sendData_.empty() ) { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if ( coords_.empty() ) - { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); + sendDataStatuses_.resize (p); + sendCoordStatuses_.resize (p); + recvDataStatuses_.resize (p); + recvCoordStatuses_.resize (p); + + sendDataRequests_.resize (p); + sendCoordRequests_.resize (p); + recvDataRequests_.resize (p); + recvCoordRequests_.resize (p); + + sendData_.resize (p); + sendCoord_.resize (p); + recvData_.resize (p); + recvCoord_.resize (p); } } } template -Int AxpyInterface2::NextIndexMatrix ( - Int target, - Int dataSize, - T * base_address, - Int *mindex) -{ - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexMatrix")) - - assert ( base_address != NULL ); - - Int matrixIndex = 0; - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numMatrices = matrices_.size(); - - // search for matrix base - for (Int m = 0; m < numMatrices; m++) - { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - if ( matrices_[m].base_ == NULL ) - { - matrices_[m].base_ = base_address; - matrixIndex = m; - break; - } - matrixIndex = m+1; - } - - // need to create 
new object - if ( matrixIndex == numMatrices) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - matrices_[matrixIndex].base_ = base_address; - } - // go through the request, data, - // status objects - const Int numCreated = matrices_[matrixIndex].data_[target].size (); - DEBUG_ONLY (if (numCreated != Int (matrices_[matrixIndex].requests_[target].size ()) || - numCreated != Int (matrices_[matrixIndex].statuses_[target].size ())) - LogicError ("size mismatch");) - + Int AxpyInterface2::NextIndexData + (Int dataSize, + std::deque < std::vector < T >> &data, + std::deque < mpi::Request > &requests, + std::deque < bool > &requestStatuses) + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexData")) + const Int numCreated = data.size (); + DEBUG_ONLY (if (numCreated != Int (requests.size ()) || + numCreated != + Int (requestStatuses.size ()))LogicError + ("size mismatch");) for (Int i = 0; i < numCreated; ++i) { - // If this request is still running, - // test to see if it finished. - if (matrices_[matrixIndex].statuses_[target][i]) + // If this request is still running, test to see if it finished. + if (requestStatuses[i]) { - const bool finished = mpi::Test (matrices_[matrixIndex].requests_[target][i]); - matrices_[matrixIndex].statuses_[target][i] = !finished; + const bool finished = mpi::Test (requests[i]); + requestStatuses[i] = !finished; } - if (!matrices_[matrixIndex].statuses_[target][i]) + if (!requestStatuses[i]) { - matrices_[matrixIndex].statuses_[target][i] = true; - matrices_[matrixIndex].data_[target][i].resize ( dataSize ); - *mindex = matrixIndex; - return i; + requestStatuses[i] = true; + data[i].resize (dataSize); + return i; } } - matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); - matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); - matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); - matrices_[matrixIndex].statuses_[target].push_back ( true ); - *mindex = matrixIndex; - + data.resize (numCreated + 1); + data[numCreated].resize (dataSize); + requests.push_back (mpi::REQUEST_NULL); + requestStatuses.push_back (true); + return numCreated; -} + } template -Int AxpyInterface2::NextIndexCoord ( - Int i, Int j, - Int target, - T * base_address, - Int *cindex) -{ + Int AxpyInterface2::NextIndexCoord + (std::deque < std::array > &coord, + std::deque < mpi::Request > &requests, + std::deque < bool > &requestStatuses) + { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexCoord")) - assert ( base_address != NULL ); - - Int coordIndex = 0; - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numCoords = coords_.size(); - - // search for matrix base - for (Int m = 0; m < numCoords; m++) - { - if ( coords_[m].base_ == base_address ) - { - coordIndex = m; - break; - } - if ( coords_[m].base_ == NULL ) - { - coords_[m].base_ = base_address; - coordIndex = m; - break; - } - coordIndex = m+1; - } - - // need to create new object - if ( coordIndex == numCoords ) - { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); - coords_[coordIndex].base_ = base_address; - } - // go through the request, data, - // status objects - const Int 
numCreated = coords_[coordIndex].coord_[target].size (); - DEBUG_ONLY (if (numCreated != Int (coords_[coordIndex].requests_[target].size ()) || - numCreated != Int (matrices_[coordIndex].statuses_[target].size ())) - LogicError ("size mismatch");) - + const Int numCreated = coord.size (); + DEBUG_ONLY (if (numCreated != Int (requests.size ()) || + numCreated != + Int (requestStatuses.size ()))LogicError + ("size mismatch");) + for (Int i = 0; i < numCreated; ++i) { - // If this request is still running, - // test to see if it finished. - if (coords_[coordIndex].statuses_[target][i]) + // If this request is still running, test to see if it finished. + if (requestStatuses[i]) { - const bool finished = mpi::Test (coords_[coordIndex].requests_[target][i]); - coords_[coordIndex].statuses_[target][i] = !finished; + const bool finished = mpi::Test (requests[i]); + requestStatuses[i] = !finished; } - if (!coords_[coordIndex].statuses_[target][i]) + if (!requestStatuses[i]) { - coords_[coordIndex].statuses_[target][i] = true; - coords_[coordIndex].coord_[target][i][0] = i; - coords_[coordIndex].coord_[target][i][1] = j; - *cindex = coordIndex; - return i; + requestStatuses[i] = true; + return i; } } - coords_[coordIndex].coord_[target].resize ( numCreated + 1 ); - coords_[coordIndex].coord_[target][numCreated][0] = i; - coords_[coordIndex].coord_[target][numCreated][1] = j; - coords_[coordIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); - coords_[coordIndex].statuses_[target].push_back ( true ); - *cindex = coordIndex; - + coord.resize (numCreated + 1); + requests.push_back (mpi::REQUEST_NULL); + requestStatuses.push_back (true); + return numCreated; -} + } template void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) @@ -332,8 +235,6 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) const Int YLDim = Y.LDim (); - Int matrix_index, coord_index; - for( Int step=0; step::Iput( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - T* XBuffer = Z.Buffer(); + const T* XBuffer = Z.LockedBuffer(); + const Int dindex = - NextIndexMatrix (destination, - numEntries, - XBuffer, - &matrix_index); + NextIndexData (numEntries, + sendData_[destination], + sendDataRequests_[destination], + sendDataStatuses_[destination]); DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) + (Int (sendData_[destination][dindex].size ()) != + numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + T *sendBuffer = sendData_[destination][dindex].data (); for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data ()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, COORD_PUT_TAG, g.VCComm (), - coords_[coord_index].requests_[destination][cindex]); + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + sendCoordStatuses_[destination]); + + Int *coord = reinterpret_cast(sendCoord_[destination][cindex].data ()); + coord[0] = i; + coord[1] = j; + coord[2] = numEntries; + + mpi::TaggedISend (coord, 3, destination, COORD_PUT_TAG, g.VCComm (), + sendCoordRequests_[destination][cindex]); } + receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; @@ -411,24 +316,25 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) const Int p = g.Size (); std::vector recvVector_; - Int coord_index; - T* XBuffer = 
Z.Buffer(); + const T* XBuffer = Z.LockedBuffer(); // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { - // send coordinates, no need to send a separate - // request object + // send coordinates const Int cindex = - NextIndexCoord (i, j, - rank, - XBuffer, - &coord_index); - Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, rank, + NextIndexCoord (sendCoord_[rank], + sendCoordRequests_[rank], + sendCoordStatuses_[rank]); + + Int *coord = reinterpret_cast(sendCoord_[rank][cindex].data ()); + coord[0] = i; + coord[1] = j; + coord[2] = -1; + + mpi::TaggedISend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), - coords_[coord_index].requests_[rank][cindex]); + sendCoordRequests_[rank][cindex]); } // Receive all of the replies @@ -525,171 +431,54 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - T* XBuffer = Z.Buffer(); + const T* XBuffer = Z.LockedBuffer(); // send data const Int dindex = - NextIndexMatrix (destination, - numEntries, - XBuffer, - &matrix_index); + NextIndexData (numEntries, + sendData_[destination], + sendDataRequests_[destination], + sendDataStatuses_[destination]); DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) + (Int (sendData_[destination][dindex].size ()) != + numEntries) LogicError ("Error in NextIndexData");) - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + T *sendBuffer = sendData_[destination][dindex].data (); for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + sendCoordStatuses_[destination]); + + Int *coord = reinterpret_cast(sendCoord_[destination][cindex].data()); + coord[0] = i; + coord[1] = j; + coord[2] = numEntries; + + mpi::TaggedISend (coord, 3, destination, COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); + sendCoordRequests_[destination][cindex]); } + receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } } -// wait -template -void AxpyInterface2::WaitMatrix ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numMatrices = matrices_.size(); - Int matrixIndex; - const T *base_address = Z.LockedBuffer(); - - // search for matrix base - for (Int m = 0; m < numMatrices; m++) - { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; - } - // matrix not found - if ( matrixIndex == numMatrices) - return; - - for (int rank = 0; rank < p; ++rank) - { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - const Int numRequests = matrices_[matrixIndex].requests_[rank].size (); - for (int i = 0; i < numRequests; i++) - { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); - matrices_[matrixIndex].statuses_[rank][i] = false; - } - } -} - -// progress communication for a particular matrix -// progress requests -template -bool AxpyInterface2::TestMatrix ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numMatrices = matrices_.size(); - Int matrixIndex; - const T *base_address 
= Z.LockedBuffer(); - - // search for matrix base - for (Int m = 0; m < numMatrices; m++) - { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; - } - - // matrix not found - if ( matrixIndex == numMatrices) - return true; - - for (int rank = 0; rank < p; ++rank) - { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - const Int numStatuses = matrices_[matrixIndex].requests_[rank].size (); - for (int i = 0; i < numStatuses; i++) - { - matrices_[matrixIndex].statuses_[rank][i] = !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); - if ( matrices_[matrixIndex].statuses_[rank][i] ) - return false; - } - } - return true; -} - -template -bool AxpyInterface2::TestCoord ( Matrix& Z ) -{ - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int p = g.Size(); - const Int numCoords = coords_.size(); - Int coordIndex; - const T *base_address = Z.LockedBuffer(); - - // search for coord base - for (Int m = 0; m < numCoords; m++) - { - if ( coords_[m].base_ == base_address ) - { - coordIndex = m; - break; - } - coordIndex = m+1; - } - - // coord not found - if ( coordIndex == numCoords) - return true; - - for (int rank = 0; rank < p; ++rank) - { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - const Int numStatuses = coords_[coordIndex].requests_[rank].size (); - for (int i = 0; i < numStatuses; i++) - { - coords_[coordIndex].statuses_[rank][i] = !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); - if ( coords_[coordIndex].statuses_[rank][i] ) - return false; - } - } - return true; -} - // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress @@ -702,53 +491,85 @@ void AxpyInterface2::Flush( Matrix& Z ) DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); - - mpi::Status status; - bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; - - while ( !DONE ) + const Int p = g.Size (); + bool _sendDataStatus, _sendCoordStatus, + _recvDataStatus, _recvCoordStatus; + + while ( 1 ) { - // similar to HandleXYZ functions in original AxpyInterface - if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status) ) + mpi::Status status; + if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm(), status) ) + { + const Int count = mpi::GetCount (status); + HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); + } + + if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm(), status) ) { - switch (status.MPI_TAG) - { - case DATA_PUT_TAG: - { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); - break; - } - case DATA_ACC_TAG: - { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); - break; - } - case REQUEST_GET_TAG: - { - HandleGlobalToLocalData ( Z ); - break; - } - } + const Int count = mpi::GetCount (status); + HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); } - - if ( nb_bar_active ) + + if ( mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status) ) { - DONE = mpi::Test ( nb_bar_request ); + HandleGlobalToLocalData ( Z ); } - else + + //test for completion + for (Int i = 0; i < p; ++i) { - // check if all sends (data or request) are - // complete for a particular matrix - if ( TestMatrix( Z ) && TestCoord( Z ) ) + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) { - 
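    // Note: the lines removed here implemented Ibarrier-based nonblocking
    // consensus: once a rank's own sends completed it posted an MPI_Ibarrier
    // and kept draining incoming messages until Test on that barrier
    // succeeded, i.e. until every rank had reached the same point. A
    // minimal sketch of that termination loop, with service () and
    // mySendsDone () as placeholders:
    //
    //   bool done = false, posted = false;
    //   mpi::Request bar;
    //   while (!done)
    //   {
    //       service (); // probe for and handle incoming messages
    //       if (posted)
    //           done = mpi::Test (bar); // consensus: everyone is finished
    //       else if (mySendsDone ())
    //       {
    //           mpi::IBarrier (g.VCComm (), bar);
    //           posted = true;
    //       }
    //   }
    // The replacement below instead tests every outstanding send and
    // receive request directly.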
mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; + if (sendDataStatuses_[i][j]) + sendDataStatuses_[i][j] = !mpi::Test (sendDataRequests_[i][j]); + if (!sendDataStatuses_[i][j]) + _sendDataStatus = true; + else + _sendDataStatus = false; } + + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + sendCoordStatuses_[i][j] = !mpi::Test (sendCoordRequests_[i][j]); + if (!sendCoordStatuses_[i][j]) + _sendCoordStatus = true; + else + _sendCoordStatus = false; + } + + // data recvs + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + recvDataStatuses_[i][j] = !mpi::Test (recvDataRequests_[i][j]); + if (!recvDataStatuses_[i][j]) + _recvDataStatus = true; + else + _recvDataStatus = false; + } + + // coord sends + const Int numrecvCoordRequests = recvCoordRequests_[i].size (); + for (Int j = 0; j < numrecvCoordRequests; ++j) + { + if (recvCoordStatuses_[i][j]) + recvCoordStatuses_[i][j] = !mpi::Test (recvCoordRequests_[i][j]); + if (!recvCoordStatuses_[i][j]) + _recvCoordStatus = true; + else + _recvCoordStatus = false; + } } + + if (_sendDataStatus && _sendCoordStatus && + _recvDataStatus && _recvCoordStatus) + break; } } @@ -773,8 +594,8 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s LogicError ("Not enough space allocated");) // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, COORD_PUT_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; @@ -831,10 +652,12 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so LogicError ("Not enough space allocated");) // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, + // we don't need coord[2], i.e numEntries + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, COORD_ACC_TAG, g.VCComm()); - Int i = coord[0]; Int j = coord[1]; + Int i = coord[0]; + Int j = coord[1]; // post receive for data T *getBuffer = getVector_.data(); @@ -890,10 +713,13 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int source = status.MPI_SOURCE; // post receive for coordinates - Int coord[2] = {-1, -1}; - mpi::TaggedRecv (coord, 2, source, + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, REQUEST_GET_TAG, g.VCComm()); - Int i = coord[0]; Int j = coord[1]; + Int i = coord[0]; + Int j = coord[1]; + // we calculate numEntries anyway, so + // coord[2] is not required const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; @@ -918,16 +744,17 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) T* XBuffer = Z.Buffer(); const Int index = - NextIndexMatrix (source, - numEntries, - XBuffer, - &matrix_index); + NextIndexData (numEntries, + sendData_[source], + sendDataRequests_[source], + sendDataStatuses_[source]); DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) + (Int (sendData_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *replyBuffer = sendData_[source][index].data (); - T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); for (Int t = 0; t < localWidth; ++t) { T *sendCol = &replyBuffer[t * localHeight]; @@ -938,7 +765,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) // 
Fire off non-blocking send mpi::TaggedISend (replyBuffer, numEntries, source, DATA_GET_TAG, g.VCComm (), - matrices_[matrix_index].requests_[source][index]); + sendDataRequests_[source][index]); } } @@ -970,19 +797,37 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int c = g.Width(); const Int p = g.Size(); + std::vector dataindices_; + dataindices_.resize (p); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - Int matrix_index, coord_index; - + const T* XBuffer = Z.LockedBuffer(); + + // prepost receives for coordinates + for ( int rank = 0; rank < p; ++rank ) + { + const Int index = + NextIndexCoord (recvCoord_[rank], + recvCoordRequests_[rank], + recvCoordStatuses_[rank]); + + dataindices_[rank] = index; + Int *coord_ = recvCoord_[rank][index].data(); + mpi::TaggedIRecv (coord_, 3, rank, COORD_PUT_TAG, g.VCComm(), + recvCoordRequests_[rank][index]); + } + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - + const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + + // send coordinates and data size for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); + + const Int numEntries = localHeight * localWidth; + + // target rank + const Int destination = receivingRow + r*receivingCol; + const Int index = + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + sendCoordStatuses_[destination]); + + int * coord_ = sendCoord_[destination][index].data (); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + sendCoordRequests_[destination][index]); + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } + + // wait for my coordinates xfer to be over + for (Int i = 0; i < p; ++i) + { + // coord receives + const Int numRecvCoordRequests = recvCoordRequests_[i].size (); + for (Int j = 0; j < numRecvCoordRequests; ++j) + { + if (recvCoordStatuses_[i][j]) + { + mpi::Wait (recvCoordRequests_[i][j]); + recvCoordStatuses_[i][j] = false; + } + } + // coord sends + const Int numSendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numSendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + { + mpi::Wait (recvCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = false; + } + } + } + + // prepost receives for data + // should be some way to get the index! 
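    // Put/Acc now run a two-phase protocol per epoch: a fixed-size header
    // (i, j, numEntries) travels on the coordinate tag first, the
    // coordinate requests are completed, and only then are the payload
    // receives preposted with exact sizes, so no probing is required.
    // One send-side round of that exchange, sketched with placeholder
    // request names:
    //
    //   Int header[3] = { i, j, numEntries };
    //   mpi::TaggedISend (header, 3, destination, COORD_PUT_TAG,
    //                     g.VCComm (), headerRequest);
    //   if (numEntries > 0)
    //       mpi::TaggedISend (sendBuffer, numEntries, destination,
    //                         DATA_PUT_TAG, g.VCComm (), dataRequest);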
+ for ( int rank = 0; rank < p; ++rank ) + { + const int index = dataindices_[rank]; + const int i = recvCoord_[rank][index][0]; + const int j = recvCoord_[rank][index][1]; + const int numEntries = recvCoord_[rank][index][2]; + + // post recv for data + if ( numEntries > 0 ) + { + const Int index = + NextIndexData (numEntries, + recvData_[rank], + recvDataRequests_[rank], + recvDataStatuses_[rank]); + + DEBUG_ONLY (if + (Int (recvData_[rank][index].size ()) != + numEntries) LogicError ("Error in NextIndexData");) + + T *recvData = recvData_[rank][index].data (); + + mpi::TaggedIRecv (recvData, numEntries, rank, + DATA_PUT_TAG, g.VCComm (), + recvDataRequests_[rank][index]); + } + } + + // sends for data + receivingRow = myProcessRow; + receivingCol = myProcessCol; + + for( Int step=0; step 0 ) { - const Int destination = receivingRow + r*receivingCol; + // target rank + const Int destination = receivingRow + r*receivingCol; - T* XBuffer = Z.Buffer(); - // send data - const Int dindex = - NextIndexMatrix (destination, - numEntries, - XBuffer, - &matrix_index); + const Int index = + NextIndexData (numEntries, + sendData_[destination], + sendDataRequests_[destination], + sendDataStatuses_[destination]); DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) + (Int (sendData_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *sendBuffer = sendData_[destination][index].data (); - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord[0] = i; coord[1] = j; - mpi::TaggedISend (coord, 2, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); + sendDataRequests_[destination][index]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingCol = (receivingCol + 1) % c; } - // progress my sends - TestMatrix ( Z ); - TestCoord ( Z ); - - // my receives - std::vector getVector; - bool flag = true; - - while ( flag ) + // wait for my data xfer to be over + for (Int i = 0; i < p; ++i) { - mpi::Status status; - flag = mpi::IProbe( mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm (), status); + // data receives + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + { + mpi::Wait (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = false; + } + } + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[i][j]) + { + mpi::Wait (sendDataRequests_[i][j]); + sendDataStatuses_[i][j] = false; + } + } + } + + // accumulate as data xfer is over + // there must be a way to get index + for ( int rank = 0; rank < p; ++rank ) + { + const int index = dataindices_[rank]; + const int i = recvCoord_[rank][index][0]; + const int j = recvCoord_[rank][index][1]; + const int numEntries = recvCoord_[rank][index][2]; - if ( flag ) + // data recv'd, now accumulate + if ( numEntries > 0 ) { - const Int source = status.MPI_SOURCE; - const Int count = mpi::GetCount (status); - - // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, - COORD_PUT_TAG, g.VCComm()); - Int i = coord[0]; Int j = coord[1]; - - // post recv for data - getVector.resize (count); - T *getBuffer = getVector.data (); - - 
mpi::TaggedRecv (getBuffer, count, source, - DATA_PUT_TAG, g.VCComm ()); - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - + const T *Buffer = reinterpret_cast < const T * >(recvData_[rank][index].data()); + const Int colAlign = (Y.ColAlign () + i) % r; const Int rowAlign = (Y.RowAlign () + j) % c; @@ -1091,17 +1021,14 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) for (Int t = 0; t < localWidth; ++t) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; + const T *XCol = &Buffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[s] = XCol[s]; } } - } - - // wait for my sends - WaitMatrix ( Z ); - - getVector.clear(); + } + + dataindices_.clear(); } template @@ -1126,8 +1053,6 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) const Int c = g.Width (); const Int p = g.Size (); - Int matrix_index; - const Int XLDim = Z.LDim(); const Int colAlign = (X.ColAlign() + i) % r; const Int rowAlign = (X.RowAlign() + j) % c; @@ -1156,16 +1081,16 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) T* XBuffer = Z.Buffer(); const Int index = - NextIndexMatrix (source, - numEntries, - XBuffer, - &matrix_index); + NextIndexData (numEntries, + sendData_[source], + sendDataRequests_[source], + sendDataStatuses_[source]); DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndexMatrix");) + (Int (sendData_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) - T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); + T *replyBuffer = sendData_[source][index].data (); for (Int t = 0; t < localWidth; ++t) { @@ -1177,18 +1102,16 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // Fire off non-blocking send mpi::TaggedISend (replyBuffer, numEntries, source, DATA_GET_TAG, g.VCComm (), - matrices_[matrix_index].requests_[source][index]); + sendDataRequests_[source][index]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - - // progress my sends - TestMatrix (Z); std::vector recvVector_; + // Receive all of the replies while ( 1 ) { @@ -1261,19 +1184,37 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int c = g.Width(); const Int p = g.Size(); + std::vector dataindices_; + dataindices_.resize (p); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - Int matrix_index, coord_index; - + const T* XBuffer = Z.LockedBuffer(); + + // prepost receives for coordinates + for ( int rank = 0; rank < p; ++rank ) + { + const Int index = + NextIndexCoord (recvCoord_[rank], + recvCoordRequests_[rank], + recvCoordStatuses_[rank]); + + dataindices_[rank] = index; + Int *coord_ = recvCoord_[rank][index].data(); + mpi::TaggedIRecv (coord_, 3, rank, COORD_PUT_TAG, g.VCComm(), + recvCoordRequests_[rank][index]); + } + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - + const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + + // send coordinates and data size for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); + + const Int numEntries = localHeight * localWidth; + + // target rank + const Int destination = receivingRow + r*receivingCol; + const Int index = + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + 
sendCoordStatuses_[destination]);
+
+        int * coord_ = sendCoord_[destination][index].data ();
+        coord_[0] = i;
+        coord_[1] = j;
+        coord_[2] = numEntries;
+
+        // send coordinates and payload size
+        mpi::TaggedISend (coord_, 3, destination,
+                COORD_PUT_TAG, g.VCComm(),
+                sendCoordRequests_[destination][index]);
+
+        receivingRow = (receivingRow + 1) % r;
+        if( receivingRow == 0 )
+            receivingCol = (receivingCol + 1) % c;
+    }
+
+    // wait for my coordinates xfer to be over
+    for (Int i = 0; i < p; ++i)
+    {
+        // coord receives
+        const Int numRecvCoordRequests = recvCoordRequests_[i].size ();
+        for (Int j = 0; j < numRecvCoordRequests; ++j)
+        {
+            if (recvCoordStatuses_[i][j])
+            {
+                mpi::Wait (recvCoordRequests_[i][j]);
+                recvCoordStatuses_[i][j] = false;
+            }
+        }
+        // coord sends
+        const Int numSendCoordRequests = sendCoordRequests_[i].size ();
+        for (Int j = 0; j < numSendCoordRequests; ++j)
+        {
+            if (sendCoordStatuses_[i][j])
+            {
+                mpi::Wait (sendCoordRequests_[i][j]);
+                sendCoordStatuses_[i][j] = false;
+            }
+        }
+    }
+
+    // prepost receives for data
+    // TODO look up the coord index directly instead of caching it in dataindices_
+    for ( int rank = 0; rank < p; ++rank )
+    {
+        const int index = dataindices_[rank];
+        const int i = recvCoord_[rank][index][0];
+        const int j = recvCoord_[rank][index][1];
+        const int numEntries = recvCoord_[rank][index][2];
+
+        // post recv for data
+        if ( numEntries > 0 )
+        {
+            const Int index =
+                NextIndexData (numEntries,
+                        recvData_[rank],
+                        recvDataRequests_[rank],
+                        recvDataStatuses_[rank]);
+
+            DEBUG_ONLY (if
+                    (Int (recvData_[rank][index].size ()) !=
+                     numEntries) LogicError ("Error in NextIndexData");)
+
+            T *recvData = recvData_[rank][index].data ();
+
+            mpi::TaggedIRecv (recvData, numEntries, rank,
+                    DATA_PUT_TAG, g.VCComm (),
+                    recvDataRequests_[rank][index]);
+        }
+    }
+
+    // sends for data
+    receivingRow = myProcessRow;
+    receivingCol = myProcessCol;
+
+    for( Int step=0; step<p; ++step )
     {
         const Int colShift = Shift( receivingRow, colAlign, r );
         const Int rowShift = Shift( receivingCol, rowAlign, c );
         const Int localHeight = Length( height, colShift, r );
         const Int localWidth = Length( width, rowShift, c );
         const Int numEntries = localHeight*localWidth;

         // send data
         if( numEntries > 0 )
         {
-            const Int destination = receivingRow + r*receivingCol;
+            // target rank
+            const Int destination = receivingRow + r*receivingCol;

-            T* XBuffer = Z.Buffer();
-            // send data
-            const Int dindex =
-                NextIndexMatrix (destination,
-                                 numEntries,
-                                 XBuffer,
-                                 &matrix_index);
+            const Int index =
+                NextIndexData (numEntries,
+                        sendData_[destination],
+                        sendDataRequests_[destination],
+                        sendDataStatuses_[destination]);

             DEBUG_ONLY (if
-                    (Int (matrices_[matrix_index].data_[destination][dindex].size ()) !=
-                     numEntries) LogicError ("Error in NextIndexMatrix");)
+                    (Int (sendData_[destination][index].size ()) !=
+                     numEntries) LogicError ("Error in NextIndex");)
+
+            T *sendBuffer = sendData_[destination][index].data ();

-            T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data ();
-
             for( Int t=0; t<localWidth; ++t )
             {
                 T* sendCol = &sendBuffer[t*localHeight];
                 const T* XCol = &XBuffer[(rowShift+t*c)*XLDim];
                 for( Int s=0; s<localHeight; ++s )
                     sendCol[s] = XCol[colShift+s*r];
             }

             mpi::TaggedISend (sendBuffer, numEntries, destination,
-                    DATA_ACC_TAG, g.VCComm(),
-                    matrices_[matrix_index].requests_[destination][dindex]);
-
-            // send coordinates
-            const Int cindex =
-                NextIndexCoord (destination, &coord_index);
-
-            Int *coord = reinterpret_cast<Int *>(coords_[coord_index].coord_[destination][cindex].data());
-            coord[0] = i; coord[1] = j;
-            mpi::TaggedISend (coord, 2, destination,
-                    COORD_ACC_TAG, g.VCComm(),
-                    coords_[coord_index].requests_[destination][cindex]);
+                    DATA_PUT_TAG, g.VCComm(),
+                    sendDataRequests_[destination][index]);
         }

         receivingRow = (receivingRow + 1) % r;
         if( receivingRow == 0 )
-            receivingCol = (receivingCol + 1) % c;
+            receivingCol = (receivingCol + 1) % c;
     }

-    // progress my sends
-    TestMatrix ( Z );
-    TestCoord ( Z );
-
-    // my receives
-    std::vector<T> getVector;
-    bool flag = true;
-
-    while ( flag )
+    // wait for my data xfer to be over
+    for (Int i = 0; i < p; ++i)
     {
-        mpi::Status status;
-        flag = mpi::IProbe( mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm (), status);
+        // data receives
+        const Int numrecvDataRequests = recvDataRequests_[i].size ();
+        for (Int j
= 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + { + mpi::Wait (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = false; + } + } + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[i][j]) + { + mpi::Wait (sendDataRequests_[i][j]); + sendDataStatuses_[i][j] = false; + } + } + } + + // accumulate as data xfer is over + // there must be a way to get index + for ( int rank = 0; rank < p; ++rank ) + { + const int index = dataindices_[rank]; + const int i = recvCoord_[rank][index][0]; + const int j = recvCoord_[rank][index][1]; + const int numEntries = recvCoord_[rank][index][2]; - if ( flag ) + // data recv'd, now accumulate + if ( numEntries > 0 ) { - const Int source = status.MPI_SOURCE; - const Int count = mpi::GetCount (status); - - // post receive for coordinates - Int coord[2]; - mpi::TaggedRecv (coord, 2, source, - COORD_ACC_TAG, g.VCComm()); - Int i = coord[0]; Int j = coord[1]; - - // post recv for data - getVector.resize (count); - T *getBuffer = getVector.data (); - - mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm ()); - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); - + const T *Buffer = reinterpret_cast < const T * >(recvData_[rank][index].data()); + const Int colAlign = (Y.ColAlign () + i) % r; const Int rowAlign = (Y.RowAlign () + j) % c; @@ -1382,17 +1408,14 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) for (Int t = 0; t < localWidth; ++t) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; + const T *XCol = &Buffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[s] += XCol[s]; } } - } - - // wait for my sends - WaitMatrix ( Z ); - - getVector.clear(); + } + + dataindices_.clear(); } // detach collectively @@ -1420,8 +1443,20 @@ void AxpyInterface2::Detach() GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - matrices_.clear(); - coords_.clear(); + sendDataStatuses_.clear (); + sendCoordStatuses_.clear (); + recvDataStatuses_.clear (); + recvCoordStatuses_.clear (); + + sendDataRequests_.clear (); + sendCoordRequests_.clear (); + recvDataRequests_.clear (); + recvCoordRequests_.clear (); + + sendData_.clear (); + sendCoord_.clear (); + recvData_.clear (); + recvCoord_.clear (); } template class AxpyInterface2; @@ -1431,3 +1466,4 @@ template class AxpyInterface2>; template class AxpyInterface2>; } // namespace El +#endif diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 74863373c9..b658b2089c 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -400,6 +400,17 @@ void Translate // ================== #if MPI_VERSION>=3 +long ReadInc (Window & win, Aint offset, long inc, int fop_root) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ReadInc")) + long otemp; + SafeMpi ( MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, fop_root, offset, MPI_SUM, + win) ); + SafeMpi ( MPI_Win_flush (fop_root, win) ); + + return otemp; +} + void SetWindowProp (Window & window, int prop) { DEBUG_ONLY (CallStackEntry cse ("mpi::SetWindowProp")) @@ -2064,6 +2075,62 @@ template void TaggedRecv (Complex < double >*buf, int count, int from, int tag, Comm comm); +template < typename R > +void TaggedRecvS (R * buf, int count, int from, + int tag, Comm comm, Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) + SafeMpi (MPI_Recv + (buf, count, TypeMap < R > (), from, tag, + comm.comm, &status)); +} + +template < typename R > +void 
TaggedRecvS (Complex < R > *buf, int count, + int from, int tag, Comm comm, Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi + (MPI_Recv + (buf, 2 * count, TypeMap < R > (), from, tag, + comm.comm, &status)); +#else + SafeMpi + (MPI_Recv + (buf, count, TypeMap < Complex < R >> (), from, + tag, comm.comm, &status)); +#endif +} + +template void TaggedRecvS (byte * buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (int *buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (long int *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned long *buf, int count, + int from, int tag, Comm comm, Status & status); +#ifdef EL_HAVE_MPI_LONG_LONG +template void TaggedRecvS (long long int *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned long long *buf, + int count, int from, int tag, + Comm comm, Status & status); +#endif +template void TaggedRecvS (float *buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (double *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (Complex < float >*buf, + int count, int from, int tag, + Comm comm, Status & status); +template void TaggedRecvS (Complex < double >*buf, + int count, int from, int tag, + Comm comm, Status & status); + // matching recv template < typename R > void TaggedMrecv (R * buf, int count, Message & msg) From 8d28666c9dab01a7c6d4ec85bd30334657ff4f00 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 8 Oct 2014 11:19:27 -0500 Subject: [PATCH 087/110] fixing handle_ functions --- include/El/core/AxpyInterface2.0.hpp | 8 +- src/core/AxpyInterface2.0.cpp | 120 +++++++++++++++------------ 2 files changed, 70 insertions(+), 58 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 6d5a9950e5..00c40e70ff 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -95,12 +95,12 @@ class AxpyInterface2 // these are only used for nonblocking // update rountines void HandleGlobalToLocalData( Matrix& Z ); - void HandleLocalToGlobalData( Matrix& Z, Int count, Int source ); - void HandleLocalToGlobalAcc( Matrix& Z, Int count, Int source ); + void HandleLocalToGlobalData( Matrix& Z, Int source ); + void HandleLocalToGlobalAcc( Matrix& Z, Int source ); void HandleGlobalToLocalData( const Matrix& Z ); - void HandleLocalToGlobalData( const Matrix& Z, Int count, Int source ); - void HandleLocalToGlobalAcc( const Matrix& Z, Int count, Int source ); + void HandleLocalToGlobalData( const Matrix& Z, Int source ); + void HandleLocalToGlobalAcc( const Matrix& Z, Int source ); }; } // namespace El #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 41cee1f5bd..d846aee12e 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -407,8 +407,6 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - Int matrix_index, coord_index; - const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); @@ -432,7 +430,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int 
i, Int j ) { const Int destination = receivingRow + r*receivingCol; const T* XBuffer = Z.LockedBuffer(); - + // send data const Int dindex = NextIndexData (numEntries, @@ -456,7 +454,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_ACC_TAG, g.VCComm(), sendDataRequests_[destination][dindex]); - + // send coordinates const Int cindex = NextIndexCoord (sendCoord_[destination], @@ -470,7 +468,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, destination, COORD_ACC_TAG, g.VCComm(), - sendCoordRequests_[destination][cindex]); + sendCoordRequests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -500,14 +498,12 @@ void AxpyInterface2::Flush( Matrix& Z ) mpi::Status status; if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm(), status) ) { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalData ( Z, count, status.MPI_SOURCE ); + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); } if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm(), status) ) { - const Int count = mpi::GetCount (status); - HandleLocalToGlobalAcc ( Z, count, status.MPI_SOURCE ); + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); } if ( mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status) ) @@ -518,53 +514,69 @@ void AxpyInterface2::Flush( Matrix& Z ) //test for completion for (Int i = 0; i < p; ++i) { - // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); - for (Int j = 0; j < numsendDataRequests; ++j) + // coord recvs + const Int numrecvCoordRequests = recvCoordRequests_[i].size (); + for (Int j = 0; j < numrecvCoordRequests; ++j) { - if (sendDataStatuses_[i][j]) - sendDataStatuses_[i][j] = !mpi::Test (sendDataRequests_[i][j]); - if (!sendDataStatuses_[i][j]) - _sendDataStatus = true; + if (recvCoordStatuses_[i][j]) + recvCoordStatuses_[i][j] = + !mpi::Test (recvCoordRequests_[i][j]); + if (!recvCoordStatuses_[i][j]) + _recvCoordStatus = true; else - _sendDataStatus = false; - } - + { + _recvCoordStatus = false; + break; + } + } + // coord sends const Int numsendCoordRequests = sendCoordRequests_[i].size (); for (Int j = 0; j < numsendCoordRequests; ++j) { if (sendCoordStatuses_[i][j]) - sendCoordStatuses_[i][j] = !mpi::Test (sendCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = + !mpi::Test (sendCoordRequests_[i][j]); if (!sendCoordStatuses_[i][j]) _sendCoordStatus = true; else + { _sendCoordStatus = false; - } + break; + } + } // data recvs const Int numrecvDataRequests = recvDataRequests_[i].size (); for (Int j = 0; j < numrecvDataRequests; ++j) { if (recvDataStatuses_[i][j]) - recvDataStatuses_[i][j] = !mpi::Test (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = + !mpi::Test (recvDataRequests_[i][j]); if (!recvDataStatuses_[i][j]) _recvDataStatus = true; else + { _recvDataStatus = false; + break; + } } - - // coord sends - const Int numrecvCoordRequests = recvCoordRequests_[i].size (); - for (Int j = 0; j < numrecvCoordRequests; ++j) + + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) { - if (recvCoordStatuses_[i][j]) - recvCoordStatuses_[i][j] = !mpi::Test (recvCoordRequests_[i][j]); - if (!recvCoordStatuses_[i][j]) - _recvCoordStatus = true; + if (sendDataStatuses_[i][j]) + sendDataStatuses_[i][j] = + !mpi::Test (sendDataRequests_[i][j]); + if (!sendDataStatuses_[i][j]) + _sendDataStatus = true; else - _recvCoordStatus = false; - } + { + _sendDataStatus = 
false; + break; + } + } } if (_sendDataStatus && _sendCoordStatus && @@ -574,7 +586,7 @@ void AxpyInterface2::Flush( Matrix& Z ) } template < typename T > -void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int source ) +void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) { DistMatrix &Y = *GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -584,6 +596,15 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s const Int myCol = g.Col (); int height = Z.Height(); int width = Z.Width(); + + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, + COORD_PUT_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + // data vector std::vector getVector_; getVector_.resize (count); @@ -593,13 +614,6 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s DEBUG_ONLY (if (Int (getVector_.size ()) != count) LogicError ("Not enough space allocated");) - // post receive for coordinates - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - COORD_PUT_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - // post receive for data T *getBuffer = getVector_.data(); mpi::TaggedRecv (getBuffer, count, source, @@ -630,7 +644,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int count, Int s // replica of above function except this accumulates template < typename T > -void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int source ) +void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int source ) { DistMatrix &Y = *GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -641,6 +655,14 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so const int height = Z.Height(); const int width = Z.Width(); + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, + COORD_ACC_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + // data buffer std::vector getVector_; getVector_.resize (count); @@ -649,16 +671,8 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so LogicError ("Count was too small");) DEBUG_ONLY (if (Int (getVector_.size ()) != count) - LogicError ("Not enough space allocated");) - - // post receive for coordinates - // we don't need coord[2], i.e numEntries - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - COORD_ACC_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - + LogicError ("Not enough space allocated");) + // post receive for data T *getBuffer = getVector_.data(); mpi::TaggedRecv (getBuffer, count, source, @@ -687,7 +701,7 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int count, Int so getVector_.clear(); } -// handle request for data, post a matching issend +// handle request for data, post a matching isend template < typename T > void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) { @@ -704,8 +718,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int myRow = g.Row(); const Int myCol = g.Col(); - Int matrix_index; - mpi::Status status; if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) From e93dbc64c59213fdd8ef3776e7f5872bd7c6f870 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 8 Oct 2014 19:59:50 -0500 Subject: [PATCH 088/110] fixed flush implementation...more testing required --- include/El/core/AxpyInterface2.0.hpp | 14 ++ src/core/AxpyInterface2.0.cpp | 221 +++++++++++++++++---------- 2 files changed, 151 insertions(+), 84 deletions(-) diff 
--git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 00c40e70ff..6da97c7bbf 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -7,6 +7,7 @@ #ifndef EL_AXPYINTERFACE2_HPP #define EL_AXPYINTERFACE2_HPP +#if MPI_VERSION>=3 namespace El { template class AxpyInterface2 @@ -80,6 +81,13 @@ class AxpyInterface2 bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; + // op count window for read increment + mpi::Window put_win_, acc_win_, + getrq_win_; + + long *put_win_base_, *acc_win_base_, + *getrq_win_base_; + // next index for data and coord Int NextIndexData ( Int dataSize, @@ -92,6 +100,11 @@ class AxpyInterface2 std::deque& requests, std::deque& requestStatuses ); + Int GetIndexData( Matrix& Z ); + Int GetIndexCoord( Matrix& Z ); + + bool TestRequests( Matrix& Z ); + // these are only used for nonblocking // update rountines void HandleGlobalToLocalData( Matrix& Z ); @@ -103,4 +116,5 @@ class AxpyInterface2 void HandleLocalToGlobalAcc( const Matrix& Z, Int source ); }; } // namespace El +#endif // MPI-3 #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index d846aee12e..a11dbde095 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -20,6 +20,8 @@ namespace El recvDataRequests_(0), recvCoordRequests_(0), sendData_(0), recvData_(0), sendCoord_(0), recvCoord_(0), + put_win_(0), acc_win_(0), getrq_win_(0), + put_win_base_(0), acc_win_base_(0), getrq_win_base_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), attached_(false), detached_(false) { } @@ -56,6 +58,25 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) recvData_.resize (p); recvCoord_.resize (p); } + + // count window related + put_win_base_ = new long; + mpi::WindowCreate ( put_win_base_, sizeof(long), + g.VCComm(), put_win_ ); + memset (put_win_base_, 0, sizeof (long)); + mpi::WindowLock (put_win_); + + acc_win_base_ = new long; + mpi::WindowCreate ( acc_win_base_, sizeof(long), + g.VCComm(), acc_win_ ); + memset (acc_win_base_, 0, sizeof (long)); + mpi::WindowLock (acc_win_); + + getrq_win_base_ = new long; + mpi::WindowCreate ( getrq_win_base_, sizeof(long), + g.VCComm(), getrq_win_ ); + memset (getrq_win_base_, 0, sizeof (long)); + mpi::WindowLock (getrq_win_); } template @@ -123,6 +144,24 @@ void AxpyInterface2::Attach( DistMatrix& Z ) recvData_.resize (p); recvCoord_.resize (p); } + // count window related + put_win_base_ = new long; + mpi::WindowCreate ( put_win_base_, sizeof(long), + g.VCComm(), put_win_ ); + memset (put_win_base_, 0, sizeof (long)); + mpi::WindowLock (put_win_); + + acc_win_base_ = new long; + mpi::WindowCreate ( acc_win_base_, sizeof(long), + g.VCComm(), acc_win_ ); + memset (acc_win_base_, 0, sizeof (long)); + mpi::WindowLock (acc_win_); + + getrq_win_base_ = new long; + mpi::WindowCreate ( getrq_win_base_, sizeof(long), + g.VCComm(), getrq_win_ ); + memset (getrq_win_base_, 0, sizeof (long)); + mpi::WindowLock (getrq_win_); } } @@ -285,6 +324,9 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, destination, COORD_PUT_TAG, g.VCComm (), sendCoordRequests_[destination][cindex]); + + // put count + mpi::ReadInc (put_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; @@ -335,6 +377,8 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), sendCoordRequests_[rank][cindex]); + // get request count + 
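The put/acc/getrq counters introduced in this patch are nothing more than one long per rank exposed through an MPI-3 window; ReadInc, defined earlier in this series via MPI_Fetch_and_op plus MPI_Win_flush, atomically adds to the target's copy and returns the previous value, so an increment of 0 doubles as an atomic read. A hedged sketch of the same mechanism in plain MPI (MPI_Win_allocate and the function names here are illustrative; the patch itself goes through Elemental's WindowCreate/WindowLock wrappers):

#include <mpi.h>

// Atomic read-modify-write on a remote long, as ReadInc does:
// inc = 1 bumps the target's counter, inc = 0 just reads it back.
long ReadIncSketch( MPI_Win win, MPI_Aint offset, long inc, int target )
{
    long previous;
    MPI_Fetch_and_op( &inc, &previous, MPI_LONG, target, offset,
                      MPI_SUM, win );
    MPI_Win_flush( target, win ); // complete remotely before using 'previous'
    return previous;
}

// One counter per rank, exposed for passive-target RMA for the lifetime of
// the attach/detach epoch; roughly mirrors the WindowCreate/WindowLock
// calls in the constructor and Attach hunks (error handling omitted).
long* CreateCounter( MPI_Comm comm, MPI_Win* win )
{
    long* base = NULL;
    MPI_Win_allocate( sizeof(long), sizeof(long), MPI_INFO_NULL, comm,
                      &base, win );
    *base = 0;                    // zero my counter before anyone reads it
    MPI_Win_lock_all( 0, *win );  // open a shared passive-target epoch
    MPI_Barrier( comm );          // everyone's counter is initialized now
    return base;
}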
mpi::ReadInc (getrq_win_, 0, 1, rank); } // Receive all of the replies @@ -343,6 +387,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { mpi::Status status; HandleGlobalToLocalData ( Z ); + if (mpi::IProbe (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { @@ -468,7 +513,10 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, destination, COORD_ACC_TAG, g.VCComm(), - sendCoordRequests_[destination][cindex]); + sendCoordRequests_[destination][cindex]); + + // acc count + mpi::ReadInc (acc_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; @@ -477,10 +525,65 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } +template +bool AxpyInterface2::TestRequests( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::TestRequests")) + + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + // coord recvs + const Int numrecvCoordRequests = recvCoordRequests_[i].size (); + for (Int j = 0; j < numrecvCoordRequests; ++j) + { + recvCoordStatuses_[i][j] = + !mpi::Test (recvCoordRequests_[i][j]); + if (recvCoordStatuses_[i][j]) + return false; + } + + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + sendCoordStatuses_[i][j] = + !mpi::Test (sendCoordRequests_[i][j]); + if (sendCoordStatuses_[i][j]) + return false; + } + + // data recvs + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + recvDataStatuses_[i][j] = + !mpi::Test (recvDataRequests_[i][j]); + if (recvDataStatuses_[i][j]) + return false; + } + + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + sendDataStatuses_[i][j] = + !mpi::Test (sendDataRequests_[i][j]); + if (sendDataStatuses_[i][j]) + return false; + } + } + + return true; +} + // flush ensures local and remote completion // this interface assumes a send has been issued // and will post a matching receive and progress -template + template void AxpyInterface2::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) @@ -490,99 +593,36 @@ void AxpyInterface2::Flush( Matrix& Z ) DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); const Int p = g.Size (); - bool _sendDataStatus, _sendCoordStatus, - _recvDataStatus, _recvCoordStatus; + const Int me = g.VCRank(); + + // get my put/get/acc recv counts + const Int put_count = mpi::ReadInc (put_win_, 0, 0, me); + const Int acc_count = mpi::ReadInc (acc_win_, 0, 0, me); + const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, me); + + TestRequests (Z); - while ( 1 ) + for (Int count = 0; count < put_count; ++count) { mpi::Status status; if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm(), status) ) - { HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - } + } + for (Int count = 0; count < acc_count; ++count) + { + mpi::Status status; if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm(), status) ) - { HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - } + } + for (Int count = 0; count < getrq_count; ++count) + { + mpi::Status status; if ( mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status) ) - { - HandleGlobalToLocalData ( Z ); - } - - //test for completion - for (Int i = 0; i < p; ++i) - { - // coord recvs - const Int numrecvCoordRequests = recvCoordRequests_[i].size (); - for (Int j = 0; j < numrecvCoordRequests; 
++j) - { - if (recvCoordStatuses_[i][j]) - recvCoordStatuses_[i][j] = - !mpi::Test (recvCoordRequests_[i][j]); - if (!recvCoordStatuses_[i][j]) - _recvCoordStatus = true; - else - { - _recvCoordStatus = false; - break; - } - } - - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - { - if (sendCoordStatuses_[i][j]) - sendCoordStatuses_[i][j] = - !mpi::Test (sendCoordRequests_[i][j]); - if (!sendCoordStatuses_[i][j]) - _sendCoordStatus = true; - else - { - _sendCoordStatus = false; - break; - } - } - - // data recvs - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) - { - if (recvDataStatuses_[i][j]) - recvDataStatuses_[i][j] = - !mpi::Test (recvDataRequests_[i][j]); - if (!recvDataStatuses_[i][j]) - _recvDataStatus = true; - else - { - _recvDataStatus = false; - break; - } - } - - // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - { - if (sendDataStatuses_[i][j]) - sendDataStatuses_[i][j] = - !mpi::Test (sendDataRequests_[i][j]); - if (!sendDataStatuses_[i][j]) - _sendDataStatus = true; - else - { - _sendDataStatus = false; - break; - } - } - } - - if (_sendDataStatus && _sendCoordStatus && - _recvDataStatus && _recvCoordStatus) - break; + HandleGlobalToLocalData ( Z ); } + } template < typename T > @@ -1469,6 +1509,19 @@ void AxpyInterface2::Detach() sendCoord_.clear (); recvData_.clear (); recvCoord_.clear (); + + mpi::WindowUnlock (put_win_); + mpi::WindowFree (put_win_); + + mpi::WindowUnlock (acc_win_); + mpi::WindowFree (acc_win_); + + mpi::WindowUnlock (getrq_win_); + mpi::WindowFree (getrq_win_); + + delete put_win_base_; + delete acc_win_base_; + delete getrq_win_base_; } template class AxpyInterface2; From 9c22a83c89e16766fef756375a0757f0e48207b3 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 9 Oct 2014 11:30:22 -0500 Subject: [PATCH 089/110] modifying Get --- src/core/AxpyInterface2.0.cpp | 175 +++++++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 43 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index a11dbde095..f654199829 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -770,8 +770,8 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) REQUEST_GET_TAG, g.VCComm()); Int i = coord[0]; Int j = coord[1]; - // we calculate numEntries anyway, so - // coord[2] is not required + // we need the localwidth/height here, + // used also to calculate numEntries const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; @@ -1106,66 +1106,137 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) const Int p = g.Size (); const Int XLDim = Z.LDim(); - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); + + std::vector dataindices_; + dataindices_.resize (p); - Int receivingRow = g.Row(); - Int receivingCol = g.Col(); - - for( Int step=0; step recvVector_; // Receive all of the replies - while ( 1 ) + Int numReplies = 0; + while ( numReplies < p ) { mpi::Status status; if (mpi::IProbe @@ -1202,9 +1273,27 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) for (Int s = 0; s < localHeight; ++s) YCol[colShift + s * r] = XCol[s]; } + ++numReplies; } recvVector_.clear(); } + // wait for my data 
xfer to be over + for (Int i = 0; i < p; ++i) + { + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[i][j]) + { + mpi::Wait (sendDataRequests_[i][j]); + sendDataStatuses_[i][j] = false; + } + } + } + + dataindices_.clear(); + } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, From 628f6b02d1b5eedf1618b78eccc6bb8b5b544a8e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Fri, 10 Oct 2014 01:41:35 -0500 Subject: [PATCH 090/110] fixed get --- include/El/core/AxpyInterface2.0.hpp | 9 +-- src/core/AxpyInterface2.0.cpp | 107 ++++++++++++++++----------- 2 files changed, 69 insertions(+), 47 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 6da97c7bbf..9196ecc90e 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -82,11 +82,8 @@ class AxpyInterface2 attached_, detached_; // op count window for read increment - mpi::Window put_win_, acc_win_, - getrq_win_; - - long *put_win_base_, *acc_win_base_, - *getrq_win_base_; + mpi::Window put_win_, acc_win_; + long *put_win_base_, *acc_win_base_; // next index for data and coord Int NextIndexData @@ -100,10 +97,12 @@ class AxpyInterface2 std::deque& requests, std::deque& requestStatuses ); + // TODO Int GetIndexData( Matrix& Z ); Int GetIndexCoord( Matrix& Z ); bool TestRequests( Matrix& Z ); + void WaitRequests( Matrix& Z ); // these are only used for nonblocking // update rountines diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index f654199829..a0047f19ef 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -7,7 +7,6 @@ which can be found in the LICENSE file in the root directory, or at #include #if MPI_VERSION>=3 -// TODO Use DDT for put/get/acc when EL_USE_DERIVED_TYPE is defined // TODO bring back const interfaces namespace El { @@ -20,8 +19,8 @@ namespace El recvDataRequests_(0), recvCoordRequests_(0), sendData_(0), recvData_(0), sendCoord_(0), recvCoord_(0), - put_win_(0), acc_win_(0), getrq_win_(0), - put_win_base_(0), acc_win_base_(0), getrq_win_base_(0), + put_win_(0), acc_win_(0), + put_win_base_(0), acc_win_base_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), attached_(false), detached_(false) { } @@ -71,12 +70,6 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) g.VCComm(), acc_win_ ); memset (acc_win_base_, 0, sizeof (long)); mpi::WindowLock (acc_win_); - - getrq_win_base_ = new long; - mpi::WindowCreate ( getrq_win_base_, sizeof(long), - g.VCComm(), getrq_win_ ); - memset (getrq_win_base_, 0, sizeof (long)); - mpi::WindowLock (getrq_win_); } template @@ -156,12 +149,6 @@ void AxpyInterface2::Attach( DistMatrix& Z ) g.VCComm(), acc_win_ ); memset (acc_win_base_, 0, sizeof (long)); mpi::WindowLock (acc_win_); - - getrq_win_base_ = new long; - mpi::WindowCreate ( getrq_win_base_, sizeof(long), - g.VCComm(), getrq_win_ ); - memset (getrq_win_base_, 0, sizeof (long)); - mpi::WindowLock (getrq_win_); } } @@ -350,7 +337,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) const Int width = Z.Width (); if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); + LogicError ("Invalid submatrix for Get"); const Grid & g = X.Grid (); const Int r = g.Height (); @@ -359,7 +346,6 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) std::vector recvVector_; - const T* XBuffer = Z.LockedBuffer(); // Send out the 
requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { @@ -369,7 +355,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) sendCoordRequests_[rank], sendCoordStatuses_[rank]); - Int *coord = reinterpret_cast(sendCoord_[rank][cindex].data ()); + Int *coord = sendCoord_[rank][cindex].data (); coord[0] = i; coord[1] = j; coord[2] = -1; @@ -377,8 +363,6 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), sendCoordRequests_[rank][cindex]); - // get request count - mpi::ReadInc (getrq_win_, 0, 1, rank); } // Receive all of the replies @@ -414,15 +398,16 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // Unpack the local matrix for (Int t = 0; t < localWidth; ++t) { - T *YCol = X.Buffer (0, rowShift + t * c); + T *YCol = Z.Buffer (0, rowShift + t * c); const T *XCol = &recvBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } + YCol[colShift + s * r] = XCol[s]; + } + ++numReplies; - recvVector_.clear(); } } + recvVector_.clear(); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, @@ -525,6 +510,55 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } +template +void AxpyInterface2::WaitRequests( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::WaitRequests")) + + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + // coord recvs + const Int numrecvCoordRequests = recvCoordRequests_[i].size (); + for (Int j = 0; j < numrecvCoordRequests; ++j) + { + if (recvCoordStatuses_[i][j]) + mpi::Wait (recvCoordRequests_[i][j]); + recvCoordStatuses_[i][j] = false; + } + + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + mpi::Wait (sendCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = false; + } + + // data recvs + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + mpi::Wait (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = false; + } + + // data sends + const Int numsendDataRequests = sendDataRequests_[i].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[i][j]) + mpi::Wait (sendDataRequests_[i][j]); + sendDataStatuses_[i][j] = false; + } + } +} + template bool AxpyInterface2::TestRequests( Matrix& Z ) { @@ -598,10 +632,7 @@ void AxpyInterface2::Flush( Matrix& Z ) // get my put/get/acc recv counts const Int put_count = mpi::ReadInc (put_win_, 0, 0, me); const Int acc_count = mpi::ReadInc (acc_win_, 0, 0, me); - const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, me); - TestRequests (Z); - for (Int count = 0; count < put_count; ++count) { mpi::Status status; @@ -616,13 +647,11 @@ void AxpyInterface2::Flush( Matrix& Z ) HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); } - for (Int count = 0; count < getrq_count; ++count) - { - mpi::Status status; - if ( mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status) ) - HandleGlobalToLocalData ( Z ); - } + for (Int count = 0; count < p; ++count) + HandleGlobalToLocalData ( Z ); + // wait for all requests - coords and data + WaitRequests (Z); } template < typename T > @@ -794,7 +823,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - T* XBuffer = 
Z.Buffer(); const Int index = NextIndexData (numEntries, sendData_[source], @@ -1268,14 +1296,13 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // Unpack the local matrix for (Int t = 0; t < localWidth; ++t) { - T *YCol = X.Buffer (0, rowShift + t * c); + T *YCol = Z.Buffer (0, rowShift + t * c); const T *XCol = &recvBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[colShift + s * r] = XCol[s]; } ++numReplies; } - recvVector_.clear(); } // wait for my data xfer to be over for (Int i = 0; i < p; ++i) @@ -1292,8 +1319,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) } } - dataindices_.clear(); - + dataindices_.clear(); + recvVector_.clear(); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, @@ -1605,12 +1632,8 @@ void AxpyInterface2::Detach() mpi::WindowUnlock (acc_win_); mpi::WindowFree (acc_win_); - mpi::WindowUnlock (getrq_win_); - mpi::WindowFree (getrq_win_); - delete put_win_base_; delete acc_win_base_; - delete getrq_win_base_; } template class AxpyInterface2; From 048875b13971dfcfe3447c217181509c249e9f07 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Thu, 30 Oct 2014 21:47:05 -0500 Subject: [PATCH 091/110] fixed Iget, Get and Flush (for Ig/p/a) --- include/El/core/AxpyInterface2.0.hpp | 5 +- src/core/AxpyInterface2.0.cpp | 516 +++++++++++++-------------- 2 files changed, 250 insertions(+), 271 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 9196ecc90e..0e5e6caa74 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -82,8 +82,9 @@ class AxpyInterface2 attached_, detached_; // op count window for read increment - mpi::Window put_win_, acc_win_; - long *put_win_base_, *acc_win_base_; + mpi::Window put_win_, acc_win_, getrq_win_; + long *put_win_base_, *acc_win_base_, + *getrq_win_base_; // next index for data and coord Int NextIndexData diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index a0047f19ef..309ec92196 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -19,8 +19,8 @@ namespace El recvDataRequests_(0), recvCoordRequests_(0), sendData_(0), recvData_(0), sendCoord_(0), recvCoord_(0), - put_win_(0), acc_win_(0), - put_win_base_(0), acc_win_base_(0), + put_win_(0), acc_win_(0), getrq_win_(0), + put_win_base_(0), acc_win_base_(0), getrq_win_base_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), attached_(false), detached_(false) { } @@ -70,6 +70,12 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) g.VCComm(), acc_win_ ); memset (acc_win_base_, 0, sizeof (long)); mpi::WindowLock (acc_win_); + + getrq_win_base_ = new long; + mpi::WindowCreate ( getrq_win_base_, sizeof(long), + g.VCComm(), getrq_win_ ); + memset (getrq_win_base_, 0, sizeof (long)); + mpi::WindowLock (getrq_win_); } template @@ -149,6 +155,12 @@ void AxpyInterface2::Attach( DistMatrix& Z ) g.VCComm(), acc_win_ ); memset (acc_win_base_, 0, sizeof (long)); mpi::WindowLock (acc_win_); + + getrq_win_base_ = new long; + mpi::WindowCreate ( getrq_win_base_, sizeof(long), + g.VCComm(), getrq_win_ ); + memset (getrq_win_base_, 0, sizeof (long)); + mpi::WindowLock (getrq_win_); } } @@ -337,15 +349,11 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) const Int width = Z.Width (); if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid submatrix for Get"); + LogicError ("Invalid submatrix for Iget"); const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); 
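A note on the counter-driven Flush these patches converge on: ReadInc tells a target how many puts, accs, and get requests were issued at it, but not whether the matching messages have arrived, and an IProbe that fires early returns false without consuming anything. A sketch of the drain loop with a blocking probe makes the required behavior explicit (plain MPI; the tag and the double element type are placeholder assumptions):

#include <mpi.h>
#include <vector>

// Drain exactly 'expected' messages of one tag. Each probe must block
// (or be retried) until a message is actually present, or the message
// would be skipped and the count lost.
void DrainExpected( MPI_Comm comm, long expected, int tag )
{
    for( long k=0; k<expected; ++k )
    {
        MPI_Status status;
        MPI_Probe( MPI_ANY_SOURCE, tag, comm, &status ); // blocks until arrival
        int count;
        MPI_Get_count( &status, MPI_DOUBLE, &count );
        std::vector<double> payload( count );
        MPI_Recv( payload.data(), count, MPI_DOUBLE, status.MPI_SOURCE, tag,
                  comm, MPI_STATUS_IGNORE );
        // ... apply 'payload' to the local portion, as
        // HandleLocalToGlobalData/HandleLocalToGlobalAcc do above ...
    }
}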
const Int p = g.Size (); - std::vector recvVector_; - // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { @@ -359,55 +367,14 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) coord[0] = i; coord[1] = j; coord[2] = -1; - + mpi::TaggedISend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), - sendCoordRequests_[rank][cindex]); - } - - // Receive all of the replies - Int numReplies = 0; - while (numReplies < p) - { - mpi::Status status; - HandleGlobalToLocalData ( Z ); - - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); - - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - - ++numReplies; - } + sendCoordRequests_[rank][cindex]); + + // modify get count + mpi::ReadInc (getrq_win_, 0, 1, rank); } - recvVector_.clear(); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, @@ -624,15 +591,21 @@ void AxpyInterface2::Flush( Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); + const Int height = Z.Height (); + const Int width = Z.Width (); + DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); const Int p = g.Size (); const Int me = g.VCRank(); - + const Int r = g.Height (); + const Int c = g.Width (); + // get my put/get/acc recv counts const Int put_count = mpi::ReadInc (put_win_, 0, 0, me); const Int acc_count = mpi::ReadInc (acc_win_, 0, 0, me); - + const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, me); + for (Int count = 0; count < put_count; ++count) { mpi::Status status; @@ -647,9 +620,9 @@ void AxpyInterface2::Flush( Matrix& Z ) HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); } - for (Int count = 0; count < p; ++count) - HandleGlobalToLocalData ( Z ); - + for (Int count = 0; count < getrq_count; ++count) + HandleGlobalToLocalData ( Z ); + // wait for all requests - coords and data WaitRequests (Z); } @@ -787,68 +760,108 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int myRow = g.Row(); const Int myCol = g.Col(); - mpi::Status status; + Int i, j; + std::vector recvVector_; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); - if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) + for (Int step = 0; step < p; step++) { - const Int source = status.MPI_SOURCE; + mpi::Status status; + if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, + REQUEST_GET_TAG, g.VCComm()); + i = coord[0]; + j = 
coord[1]; - // post receive for coordinates - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - REQUEST_GET_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - // we need the localwidth/height here, - // used also to calculate numEntries - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; + // we need the localwidth/height here, + // used also to calculate numEntries - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); - const Int numEntries = localHeight * localWidth; + const Int numEntries = localHeight * localWidth; - DEBUG_ONLY (if (numEntries < Int (sizeof (T))) - LogicError ("Count was too small");) + DEBUG_ONLY (if (numEntries < Int (sizeof (T))) + LogicError ("Count was too small");) - const Int index = - NextIndexData (numEntries, - sendData_[source], - sendDataRequests_[source], - sendDataStatuses_[source]); - - DEBUG_ONLY (if - (Int (sendData_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *replyBuffer = sendData_[source][index].data (); - - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &replyBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); + const Int index = + NextIndexData (numEntries, + sendData_[source], + sendDataRequests_[source], + sendDataStatuses_[source]); + + DEBUG_ONLY (if + (Int (sendData_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *replyBuffer = sendData_[source][index].data (); + + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } + + // Fire off non-blocking send + mpi::TaggedISend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + sendDataRequests_[source][index]); } - // Fire off non-blocking send - mpi::TaggedISend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), - sendDataRequests_[source][index]); + // receive data + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift 
(myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } + } } + recvVector_.clear(); } - + // blocking update routines template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) @@ -1120,193 +1133,155 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); - DistMatrix& X = *GlobalArrayGet_; - - const Int height = Z.Height (); - const Int width = Z.Width (); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); - const Grid & g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - - const Int XLDim = Z.LDim(); - - std::vector dataindices_; - dataindices_.resize (p); - - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); - const T* XBuffer = Z.LockedBuffer(); - - // prepost receives for coordinates - for ( Int rank = 0; rank < p; ++rank ) - { - const Int index = - NextIndexCoord (recvCoord_[rank], - recvCoordRequests_[rank], - recvCoordStatuses_[rank]); - - dataindices_[rank] = index; - Int *coord_ = recvCoord_[rank][index].data(); - mpi::TaggedIRecv (coord_, 3, rank, COORD_PUT_TAG, g.VCComm(), - recvCoordRequests_[rank][index]); - } - - // send coordinates - for( Int rank=0; rank recvVector_; + + const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, g.VCRank()); + + for (Int count = 0; count < getrq_count; ++count) { - // coord receives - const Int numRecvCoordRequests = recvCoordRequests_[i].size (); - for (Int j = 0; j < numRecvCoordRequests; ++j) - { - if (recvCoordStatuses_[i][j]) - { - mpi::Wait (recvCoordRequests_[i][j]); - recvCoordStatuses_[i][j] = false; - } - } - // coord sends - const Int numSendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numSendCoordRequests; ++j) + for (Int step = 0; step < p; step++) { - if (sendCoordStatuses_[i][j]) + mpi::Status status; + if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { - mpi::Wait (recvCoordRequests_[i][j]); - sendCoordStatuses_[i][j] = false; - } - } - } - - // exchange data - // data sends - for( Int source=0; source recvVector_; + DEBUG_ONLY (if + (Int (sendData_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) - // Receive all of the replies - Int numReplies = 0; - while ( numReplies < p ) - { - mpi::Status status; - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); + T *replyBuffer = sendData_[source][index].data (); - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + for (Int t = 0; t < localWidth; ++t) + { + T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } - // Compute the 
local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); + // Fire off non-blocking send + mpi::TaggedISend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + sendDataRequests_[source][index]); + } - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) + // receive data + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } } - ++numReplies; } } - // wait for my data xfer to be over + + // wait for my data/coord xfer to be over for (Int i = 0; i < p; ++i) { + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + { + mpi::Wait (sendCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = false; + } + } + // data sends const Int numsendDataRequests = sendDataRequests_[i].size (); for (Int j = 0; j < numsendDataRequests; ++j) @@ -1317,9 +1292,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) sendDataStatuses_[i][j] = false; } } - } - - dataindices_.clear(); + } + recvVector_.clear(); } @@ -1631,9 +1605,13 @@ void AxpyInterface2::Detach() mpi::WindowUnlock (acc_win_); mpi::WindowFree (acc_win_); - + + mpi::WindowUnlock (getrq_win_); + mpi::WindowFree (getrq_win_); + delete put_win_base_; delete acc_win_base_; + delete getrq_win_base_; } template class AxpyInterface2; From ce206e90a3c0e234cfc7b191dcbdec4a54d981e5 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 8 Nov 2014 02:36:18 -0600 Subject: [PATCH 092/110] modified blocking interfaces --- src/core/AxpyInterface2.0.cpp | 705 ++++++++++++++++++---------------- 1 file changed, 365 insertions(+), 340 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 309ec92196..4675f1713b 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -22,7 +22,7 @@ namespace El put_win_(0), acc_win_(0), getrq_win_(0), put_win_base_(0), acc_win_base_(0), getrq_win_base_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), - attached_(false), detached_(false) + attached_(false), 
detached_(false) { } template @@ -321,7 +321,8 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) coord[1] = j; coord[2] = numEntries; - mpi::TaggedISend (coord, 3, destination, COORD_PUT_TAG, g.VCComm (), + mpi::TaggedISend (coord, 3, destination, + COORD_PUT_TAG, g.VCComm (), sendCoordRequests_[destination][cindex]); // put count @@ -625,6 +626,11 @@ void AxpyInterface2::Flush( Matrix& Z ) // wait for all requests - coords and data WaitRequests (Z); + + // flush counts + mpi::ReadInc (put_win_, 0, -put_count, me); + mpi::ReadInc (acc_win_, 0, -acc_count, me); + mpi::ReadInc (getrq_win_, 0, -getrq_count, me); } template < typename T > @@ -677,8 +683,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] = XCol[s]; + MemCopy (YCol, XCol, localHeight); } // Free the memory getVector_.clear(); @@ -890,129 +895,20 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int c = g.Width(); const Int p = g.Size(); - std::vector dataindices_; - dataindices_.resize (p); - const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); const T* XBuffer = Z.LockedBuffer(); - - // prepost receives for coordinates - for ( int rank = 0; rank < p; ++rank ) - { - const Int index = - NextIndexCoord (recvCoord_[rank], - recvCoordRequests_[rank], - recvCoordStatuses_[rank]); - - dataindices_[rank] = index; - Int *coord_ = recvCoord_[rank][index].data(); - mpi::TaggedIRecv (coord_, 3, rank, COORD_PUT_TAG, g.VCComm(), - recvCoordRequests_[rank][index]); - } - + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - const Int YLDim = Y.LDim(); - - // send coordinates and data size - for( Int step=0; step 0 ) - { - const Int index = - NextIndexData (numEntries, - recvData_[rank], - recvDataRequests_[rank], - recvDataStatuses_[rank]); - - DEBUG_ONLY (if - (Int (recvData_[rank][index].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *recvData = recvData_[rank][index].data (); - - mpi::TaggedIRecv (recvData, numEntries, rank, - DATA_PUT_TAG, g.VCComm (), - recvDataRequests_[rank][index]); - } - } + const Int YLDim = Y.LDim(); - // sends for data - receivingRow = myProcessRow; - receivingCol = myProcessCol; - + // data/coord send for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) const Int numEntries = localHeight * localWidth; - // send data if( numEntries > 0 ) { - // target rank const Int destination = receivingRow + r*receivingCol; - + // data const Int index = NextIndexData (numEntries, sendData_[destination], @@ -1052,78 +946,303 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) mpi::TaggedISend (sendBuffer, numEntries, destination, DATA_PUT_TAG, g.VCComm(), sendDataRequests_[destination][index]); + + // coordinates + const Int cindex = + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + sendCoordStatuses_[destination]); + + Int * coord_ = sendCoord_[destination][cindex].data (); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + sendCoordRequests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } + - // wait for my data xfer to be over - for (Int i = 0; i < p; ++i) + // test 
for requests + for (Int rank = 0; rank < p; ++rank) { - // data receives - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + sendCoordStatuses_[rank][j] = + !mpi::Test (sendCoordRequests_[rank][j]); + // data sends + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + sendDataStatuses_[rank][j] = + !mpi::Test (sendDataRequests_[rank][j]); + } + + // data/coord receive + std::vector recvVector_; + for ( int step = 0; step < p; ++step ) + { + mpi::Status status; + if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm(), status) ) { - if (recvDataStatuses_[i][j]) + const Int source = status.MPI_SOURCE; + // coordinates + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, + COORD_PUT_TAG, g.VCComm()); + + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // data + mpi::TaggedRecv (recvBuffer, count, source, + DATA_PUT_TAG, g.VCComm ()); + + // Update Y + const T *XBuffer = recvBuffer; + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + + const Int colShift = Shift (g.Row(), colAlign, r); + const Int rowShift = Shift (g.Col(), rowAlign, c); + + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); + + for (Int t = 0; t < localWidth; ++t) { - mpi::Wait (recvDataRequests_[i][j]); - recvDataStatuses_[i][j] = false; - } + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + MemCopy (YCol, XCol, localHeight); + } + } + } + + // wait for requests + for (Int rank = 0; rank < p; ++rank) + { + // coord sends + const Int numSendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numSendCoordRequests; ++j) + { + if (sendCoordStatuses_[rank][j]) + { + mpi::Wait (sendCoordRequests_[rank][j]); + sendCoordStatuses_[rank][j] = false; + } } // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); + const Int numsendDataRequests = sendDataRequests_[rank].size (); for (Int j = 0; j < numsendDataRequests; ++j) { - if (sendDataStatuses_[i][j]) + if (sendDataStatuses_[rank][j]) { - mpi::Wait (sendDataRequests_[i][j]); - sendDataStatuses_[i][j] = false; - } + mpi::Wait (sendDataRequests_[rank][j]); + sendDataStatuses_[rank][j] = false; + } } - } - - // accumulate as data xfer is over - // there must be a way to get index - for ( int rank = 0; rank < p; ++rank ) + } + + recvVector_.clear(); +} + +template +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const 
Int width = Z.Width(); + + std::vector recvVector_; + + // Send out the get requests to everyone in grid + for (Int rank = 0; rank < p; ++rank) { - const int index = dataindices_[rank]; - const int i = recvCoord_[rank][index][0]; - const int j = recvCoord_[rank][index][1]; - const int numEntries = recvCoord_[rank][index][2]; + // send coordinates + const Int cindex = + NextIndexCoord (sendCoord_[rank], + sendCoordRequests_[rank], + sendCoordStatuses_[rank]); + + Int *coord = sendCoord_[rank][cindex].data (); + coord[0] = i; + coord[1] = j; + coord[2] = -1; - // data recv'd, now accumulate - if ( numEntries > 0 ) + mpi::TaggedISend (coord, 3, rank, + REQUEST_GET_TAG, g.VCComm (), + sendCoordRequests_[rank][cindex]); + } + + // test for requests + for (Int rank = 0; rank < p; ++rank) + { + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + sendCoordStatuses_[rank][j] = + !mpi::Test (sendCoordRequests_[rank][j]); + } + + for ( Int step = 0; step < p; ++step ) + { + mpi::Status status; + if (mpi::IProbe + (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) { - // Update Y - const T *Buffer = reinterpret_cast < const T * >(recvData_[rank][index].data()); - - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; + const Int source = status.MPI_SOURCE; + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv (coord, 3, source, + REQUEST_GET_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; - const Int colShift = Shift (g.Row(), colAlign, r); - const Int rowShift = Shift (g.Col(), rowAlign, c); + // we need the localwidth/height here, + // used also to calculate numEntries + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); const Int localHeight = Length (height, colShift, r); const Int localWidth = Length (width, rowShift, c); const Int iLocalOffset = Length (i, Y.ColShift (), r); const Int jLocalOffset = Length (j, Y.RowShift (), c); + const Int numEntries = localHeight * localWidth; + + DEBUG_ONLY (if (numEntries < Int (sizeof (T))) + LogicError ("Count was too small");) + + const Int index = + NextIndexData (numEntries, + sendData_[source], + sendDataRequests_[source], + sendDataStatuses_[source]); + + DEBUG_ONLY (if + (Int (sendData_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *replyBuffer = sendData_[source][index].data (); for (Int t = 0; t < localWidth; ++t) { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &Buffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] = XCol[s]; + T *sendCol = &replyBuffer[t * localHeight]; + const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); + MemCopy (sendCol, XCol, localHeight); + } + + // Fire off non-blocking send + mpi::TaggedISend (replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + sendDataRequests_[source][index]); + } + } + + // wait/test for send requests + for (Int rank = 0; rank < p; ++rank) + { + // data sends + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + sendDataStatuses_[rank][j] = + !mpi::Test (sendDataRequests_[rank][j]); + // coord sends + const Int numSendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numSendCoordRequests; ++j) + { + if 
(sendCoordStatuses_[rank][j]) + { + mpi::Wait (sendCoordRequests_[rank][j]); + sendCoordStatuses_[rank][j] = false; } } - } + } + + // receive data + for (Int step = 0; step < p; ++step) + { + mpi::Status status; + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); - dataindices_.clear(); + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Z.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } + } + } + + // wait for send requests + for (Int rank = 0; rank < p; ++rank) + { + // data sends + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[rank][j]) + { + mpi::Wait (sendDataRequests_[rank][j]); + sendDataStatuses_[rank][j] = false; + } + } + } } +/* template void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { @@ -1267,39 +1386,12 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) } } } - - // wait for my data/coord xfer to be over - for (Int i = 0; i < p; ++i) - { - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - { - if (sendCoordStatuses_[i][j]) - { - mpi::Wait (sendCoordRequests_[i][j]); - sendCoordStatuses_[i][j] = false; - } - } - - // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - { - if (sendDataStatuses_[i][j]) - { - mpi::Wait (sendDataRequests_[i][j]); - sendDataStatuses_[i][j] = false; - } - } - } - - recvVector_.clear(); } +*/ // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width -template + template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) @@ -1326,129 +1418,20 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int c = g.Width(); const Int p = g.Size(); - std::vector dataindices_; - dataindices_.resize (p); - const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); const T* XBuffer = Z.LockedBuffer(); - - // prepost receives for coordinates - for ( int rank = 0; rank < p; ++rank ) - { - const Int index = - NextIndexCoord (recvCoord_[rank], - recvCoordRequests_[rank], - recvCoordStatuses_[rank]); - - dataindices_[rank] = index; - Int *coord_ = recvCoord_[rank][index].data(); - mpi::TaggedIRecv (coord_, 3, rank, COORD_PUT_TAG, g.VCComm(), - recvCoordRequests_[rank][index]); - } - + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; - const Int YLDim = Y.LDim(); - - // send coordinates and data size - for( Int step=0; step 0 ) - { - const Int index = - NextIndexData (numEntries, - recvData_[rank], - 
recvDataRequests_[rank], - recvDataStatuses_[rank]); - - DEBUG_ONLY (if - (Int (recvData_[rank][index].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *recvData = recvData_[rank][index].data (); - - mpi::TaggedIRecv (recvData, numEntries, rank, - DATA_PUT_TAG, g.VCComm (), - recvDataRequests_[rank][index]); - } - } + const Int YLDim = Y.LDim(); - // sends for data - receivingRow = myProcessRow; - receivingCol = myProcessCol; - + // data/coord send for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) const Int numEntries = localHeight * localWidth; - // send data if( numEntries > 0 ) { - // target rank const Int destination = receivingRow + r*receivingCol; - + // data const Int index = NextIndexData (numEntries, sendData_[destination], @@ -1475,7 +1456,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) (Int (sendData_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = sendData_[destination][index].data (); + T *sendBuffer = sendData_[destination][index].data (); for( Int t=0; t::Acc( Matrix& Z, Int i, Int j ) } mpi::TaggedISend (sendBuffer, numEntries, destination, - DATA_PUT_TAG, g.VCComm(), + DATA_ACC_TAG, g.VCComm(), sendDataRequests_[destination][index]); + + // coordinates + const Int cindex = + NextIndexCoord (sendCoord_[destination], + sendCoordRequests_[destination], + sendCoordStatuses_[destination]); + + Int * coord_ = sendCoord_[destination][cindex].data (); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + sendCoordRequests_[destination][cindex]); + mpi::ReadInc (acc_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } + - // wait for my data xfer to be over - for (Int i = 0; i < p; ++i) + // test for requests + for (Int rank = 0; rank < p; ++rank) { - // data receives - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) - { - if (recvDataStatuses_[i][j]) - { - mpi::Wait (recvDataRequests_[i][j]); - recvDataStatuses_[i][j] = false; - } - } + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + sendCoordStatuses_[rank][j] = + !mpi::Test (sendCoordRequests_[rank][j]); // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); + const Int numsendDataRequests = sendDataRequests_[rank].size (); for (Int j = 0; j < numsendDataRequests; ++j) - { - if (sendDataStatuses_[i][j]) - { - mpi::Wait (sendDataRequests_[i][j]); - sendDataStatuses_[i][j] = false; - } - } - } - - // accumulate as data xfer is over - // there must be a way to get index - for ( int rank = 0; rank < p; ++rank ) + sendDataStatuses_[rank][j] = + !mpi::Test (sendDataRequests_[rank][j]); + } + + // data/coord receive + std::vector recvVector_; + for ( int step = 0; step < p; ++step ) { - const int index = dataindices_[rank]; - const int i = recvCoord_[rank][index][0]; - const int j = recvCoord_[rank][index][1]; - const int numEntries = recvCoord_[rank][index][2]; - - // data recv'd, now accumulate - if ( numEntries > 0 ) + mpi::Status status; + if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm(), status) ) { - // Update Y - const T *Buffer = reinterpret_cast < const T * >(recvData_[rank][index].data()); + const Int source = status.MPI_SOURCE; + // coordinates + Int coord[3]; + 
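+                // An IProbe hit on DATA_ACC_TAG means an accumulate payload is
+                // pending from this source. Its companion 3-integer coordinate
+                // message (target offsets i, j and the element count) is
+                // received first, so the receive buffer can be sized before
+                // the payload itself is received on DATA_ACC_TAG below.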
mpi::TaggedRecv (coord, 3, source, + COORD_ACC_TAG, g.VCComm()); + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // data + mpi::TaggedRecv (recvBuffer, count, source, + DATA_ACC_TAG, g.VCComm ()); + + // Update Y + const T *XBuffer = recvBuffer; const Int colAlign = (Y.ColAlign () + i) % r; const Int rowAlign = (Y.RowAlign () + j) % c; @@ -1550,14 +1550,39 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) for (Int t = 0; t < localWidth; ++t) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &Buffer[t * localHeight]; + const T *XCol = &XBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[s] += XCol[s]; } } - } + } - dataindices_.clear(); + // wait for requests + for (Int rank = 0; rank < p; ++rank) + { + // coord sends + const Int numSendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numSendCoordRequests; ++j) + { + if (sendCoordStatuses_[rank][j]) + { + mpi::Wait (sendCoordRequests_[rank][j]); + sendCoordStatuses_[rank][j] = false; + } + } + // data sends + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + { + if (sendDataStatuses_[rank][j]) + { + mpi::Wait (sendDataRequests_[rank][j]); + sendDataStatuses_[rank][j] = false; + } + } + } + + recvVector_.clear(); } // detach collectively From a9c85ea96b1fee1facc1b26646d1c616ca6cfc34 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 8 Nov 2014 17:49:32 -0600 Subject: [PATCH 093/110] updated get/acc/put...got to test them with larger PEs --- src/core/AxpyInterface2.0.cpp | 184 +++------------------------------- 1 file changed, 12 insertions(+), 172 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 4675f1713b..2c8787d485 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -907,7 +907,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + // data/coord send for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight * localWidth; - + const Int numEntries = localHeight * localWidth; + if( numEntries > 0 ) { const Int destination = receivingRow + r*receivingCol; @@ -933,7 +933,7 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) (Int (sendData_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = sendData_[destination][index].data (); + T *sendBuffer = sendData_[destination][index].data (); for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // test for requests for (Int rank = 0; rank < p; ++rank) { - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - sendCoordStatuses_[rank][j] = - !mpi::Test (sendCoordRequests_[rank][j]); // data sends const Int numsendDataRequests = sendDataRequests_[rank].size (); for (Int j = 0; j < numsendDataRequests; ++j) @@ -987,7 +981,8 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) // data/coord receive std::vector recvVector_; - for ( int step = 0; step < p; ++step ) + + for (Int step=0; step::Put( Matrix& Z, Int i, Int j ) for (Int t = 0; t < localWidth; ++t) { T *YCol = Y.Buffer (iLocalOffset, jLocalOffset 
+ t); - const T *XCol = &XBuffer[t * localHeight]; + const T *XCol = &XBuffer[t * localHeight]; MemCopy (YCol, XCol, localHeight); } } @@ -1242,156 +1237,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) } } -/* template -void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - - // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) - { - // send coordinates - const Int cindex = - NextIndexCoord (sendCoord_[rank], - sendCoordRequests_[rank], - sendCoordStatuses_[rank]); - - Int *coord = sendCoord_[rank][cindex].data (); - coord[0] = i; - coord[1] = j; - coord[2] = -1; - - mpi::TaggedISend (coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - sendCoordRequests_[rank][cindex]); - - // modify get count - mpi::ReadInc (getrq_win_, 0, 1, rank); - } - - const Int myRow = g.Row(); - const Int myCol = g.Col(); - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - std::vector recvVector_; - - const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, g.VCRank()); - - for (Int count = 0; count < getrq_count; ++count) - { - for (Int step = 0; step < p; step++) - { - mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // post receive for coordinates - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - REQUEST_GET_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - - // we need the localwidth/height here, - // used also to calculate numEntries - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - const Int numEntries = localHeight * localWidth; - - DEBUG_ONLY (if (numEntries < Int (sizeof (T))) - LogicError ("Count was too small");) - - const Int index = - NextIndexData (numEntries, - sendData_[source], - sendDataRequests_[source], - sendDataStatuses_[source]); - - DEBUG_ONLY (if - (Int (sendData_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *replyBuffer = sendData_[source][index].data (); - - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &replyBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } - - // Fire off non-blocking send - mpi::TaggedISend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), - sendDataRequests_[source][index]); - } - - // receive data - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); - 
- // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); - - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - } - } - } -} -*/ - -// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, -// where X is height x width - template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) @@ -1430,7 +1276,7 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + // data/coord send for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight * localWidth; - + const Int numEntries = localHeight * localWidth; + if( numEntries > 0 ) { const Int destination = receivingRow + r*receivingCol; @@ -1485,23 +1331,16 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord_, 3, destination, COORD_ACC_TAG, g.VCComm(), sendCoordRequests_[destination][cindex]); - mpi::ReadInc (acc_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // test for requests for (Int rank = 0; rank < p; ++rank) { - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - sendCoordStatuses_[rank][j] = - !mpi::Test (sendCoordRequests_[rank][j]); // data sends const Int numsendDataRequests = sendDataRequests_[rank].size (); for (Int j = 0; j < numsendDataRequests; ++j) @@ -1511,7 +1350,8 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) // data/coord receive std::vector recvVector_; - for ( int step = 0; step < p; ++step ) + + for (Int step=0; step Date: Sun, 9 Nov 2014 19:21:15 -0600 Subject: [PATCH 094/110] updated readinc to localflush instead of flush remote --- src/core/AxpyInterface2.0.cpp | 4 ++++ src/core/imports/mpi.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 2c8787d485..f2b10aabaf 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -602,6 +602,10 @@ void AxpyInterface2::Flush( Matrix& Z ) const Int r = g.Height (); const Int c = g.Width (); + mpi::Flush (put_win_); + mpi::Flush (acc_win_); + mpi::Flush (getrq_win_); + // get my put/get/acc recv counts const Int put_count = mpi::ReadInc (put_win_, 0, 0, me); const Int acc_count = mpi::ReadInc (acc_win_, 0, 0, me); diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index b658b2089c..08840d9f99 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -406,7 +406,7 @@ long ReadInc (Window & win, Aint offset, long inc, int fop_root) long otemp; SafeMpi ( MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, fop_root, offset, MPI_SUM, win) ); - SafeMpi ( 
MPI_Win_flush (fop_root, win) ); + SafeMpi ( MPI_Win_flush_local (fop_root, win) ); return otemp; } From a515315c36a3af279eb25da8d19abff2441d17d0 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Tue, 18 Nov 2014 18:09:55 -0600 Subject: [PATCH 095/110] introduced rma-blocking interface for put/get and ensured local completion for rma-nb...also introduced collective-flush, Cflush for axpy2, which follows nbc (incorrect usage of which shall lead to deadlock) --- include/El/core/AxpyInterface.hpp | 8 +- src/core/AxpyInterface.cpp | 54 ++--- src/core/AxpyInterface2.0.cpp | 95 +++++++-- src/core/RmaInterface.cpp | 339 +++++++++++++++++++++++++++++- src/core/imports/mpi.cpp | 1 + 5 files changed, 435 insertions(+), 62 deletions(-) diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index c0a8d3d5a6..ced31f2ddf 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -57,6 +57,10 @@ class AxpyInterface #else std::vector sentEomTo_, haveEomFrom_; std::vector eomSendRequests_; + // Check if we are done with this attachment's work + bool Finished(); + // Progress functions + void UpdateRequestStatuses(); #endif std::vector> @@ -70,10 +74,6 @@ class AxpyInterface byte sendDummy_, recvDummy_; - // Check if we are done with this attachment's work - bool Finished(); - // Progress functions - void UpdateRequestStatuses(); #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) bool ReturnRequestStatuses(); #else diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index 3ef28f35d3..7ffc0210b5 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -710,35 +710,10 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) return numCreated; } - template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); - const Int p = g.Size (); - - for (Int i = 0; i < p; ++i) - { - const Int numDataSendRequests = dataSendRequests_[i].size (); - for (Int j = 0; j < numDataSendRequests; ++j) - if (sendingData_[i][j]) - sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); - const Int numRequestSendRequests = requestSendRequests_[i].size (); - for (Int j = 0; j < numRequestSendRequests; ++j) - if (sendingRequest_[i][j]) - sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); - const Int numReplySendRequests = replySendRequests_[i].size (); - for (Int j = 0; j < numReplySendRequests; ++j) - if (sendingReply_[i][j]) - sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); - } - } - #if MPI_VERSION>=3 && defined(EL_USE_NONBLOCKING_CONSENSUS) template < typename T > bool AxpyInterface < T >::ReturnRequestStatuses () { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReturnRequestStatuses")) const Grid & g = (attachedForLocalToGlobal_ ? localToGlobalMat_->Grid () : globalToLocalMat_->Grid ()); @@ -772,7 +747,32 @@ template < typename T > bool AxpyInterface < T >::ReturnRequestStatuses () } } return true; - } + } +#else + template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () + { + DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) + const Grid & g = (attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid ()); + const Int p = g.Size (); + + for (Int i = 0; i < p; ++i) + { + const Int numDataSendRequests = dataSendRequests_[i].size (); + for (Int j = 0; j < numDataSendRequests; ++j) + if (sendingData_[i][j]) + sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); + const Int numRequestSendRequests = requestSendRequests_[i].size (); + for (Int j = 0; j < numRequestSendRequests; ++j) + if (sendingRequest_[i][j]) + sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); + const Int numReplySendRequests = replySendRequests_[i].size (); + for (Int j = 0; j < numReplySendRequests; ++j) + if (sendingReply_[i][j]) + sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); + } + } #endif template < typename T > void AxpyInterface < T >::Detach () diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index f2b10aabaf..e98d794891 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -286,6 +286,8 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const T* XBuffer = Z.LockedBuffer(); + // put count + mpi::ReadInc (put_win_, 0, 1, destination); const Int dindex = NextIndexData (numEntries, @@ -324,9 +326,6 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, destination, COORD_PUT_TAG, g.VCComm (), sendCoordRequests_[destination][cindex]); - - // put count - mpi::ReadInc (put_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; @@ -358,6 +357,9 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { + // modify get count + mpi::ReadInc (getrq_win_, 0, 1, rank); + // send coordinates const Int cindex = NextIndexCoord (sendCoord_[rank], @@ -372,9 +374,6 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), sendCoordRequests_[rank][cindex]); - - // modify get count - mpi::ReadInc (getrq_win_, 0, 1, rank); } } @@ -428,6 +427,8 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const T* XBuffer = Z.LockedBuffer(); + // acc count + mpi::ReadInc (acc_win_, 0, 1, destination); // send data const Int dindex = @@ -467,9 +468,6 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) mpi::TaggedISend (coord, 3, destination, COORD_ACC_TAG, g.VCComm(), sendCoordRequests_[destination][cindex]); - - // acc count - mpi::ReadInc (acc_win_, 0, 1, destination); } receivingRow = (receivingRow + 1) % r; @@ -637,6 +635,62 @@ void AxpyInterface2::Flush( Matrix& Z ) mpi::ReadInc (getrq_win_, 0, -getrq_count, me); } +// This is collective flush, this requires all PEs +// to invoke the Ibarrier, incorrect usage will lead +// to deadlocks +template +void AxpyInterface2::Cflush( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Cflush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + + mpi::Status status; + + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while ( !DONE ) + { + // similar to HandleXYZ functions in original AxpyInterface + if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) + { + switch (status.MPI_TAG) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData ( Z, 
status.MPI_SOURCE ); + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + } + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData ( Z ); + } + } + } + if ( nb_bar_active ) + { + DONE = mpi::Test ( nb_bar_request ); + } + else + { + // all sends (data or request) are complete + if ( TestRequests( Z ) ) + { + mpi::IBarrier ( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } + } +} + template < typename T > void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) { @@ -748,6 +802,7 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int source ) for (Int s = 0; s < localHeight; ++s) YCol[s] += XCol[s]; } + // Free the memory getVector_.clear(); } @@ -792,7 +847,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) // we need the localwidth/height here, // used also to calculate numEntries - const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; @@ -868,6 +922,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) } } } + recvVector_.clear(); } @@ -1280,7 +1335,10 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int YLDim = Y.LDim(); - + + // data/coord receive + std::vector recvVector_; + // data/coord send for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) COORD_ACC_TAG, g.VCComm(), sendCoordRequests_[destination][cindex]); } - + receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; @@ -1345,16 +1403,13 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) // test for requests for (Int rank = 0; rank < p; ++rank) { - // data sends - const Int numsendDataRequests = sendDataRequests_[rank].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - sendDataStatuses_[rank][j] = - !mpi::Test (sendDataRequests_[rank][j]); + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + sendCoordStatuses_[rank][j] = + !mpi::Test (sendCoordRequests_[rank][j]); } - // data/coord receive - std::vector recvVector_; - for (Int step=0; step=3 namespace El @@ -178,7 +176,8 @@ Int RmaInterface::NextIndex std::deque > &dataVectors) { DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) - const Int numCreated = dataVectors.size (); + + const Int numCreated = dataVectors.size (); dataVectors.resize (numCreated + 1); dataVectors[numCreated].resize (dataSize); @@ -186,6 +185,7 @@ Int RmaInterface::NextIndex return numCreated; } +// Blocking template void RmaInterface::Put( Matrix& Z, Int i, Int j ) { @@ -257,6 +257,310 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } + // Flush all transfers + mpi::Flush (window); +} + +template +void RmaInterface::Put( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) 
% c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + const Int YLDim = Y.LDim (); + + for( Int step=0; step::NextIndex ( numEntries, + putVector_[destination]); + T* sendBuffer = putVector_[destination][index].data(); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t +void RmaInterface::Acc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated."); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative."); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix."); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim (); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step::NextIndex ( numEntries, + putVector_[destination]); + + T* sendBuffer = putVector_[destination][index].data(); + T* XBuffer = Z.Buffer(); + + for( Int t=0; t +void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated."); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative."); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix."); + + //do rma related checks + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim (); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step::NextIndex ( numEntries, + putVector_[destination]); + T* sendBuffer = putVector_[destination][index].data(); + const T* XBuffer = Z.LockedBuffer(); + + for( Int t=0; t +void RmaInterface::Iput( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of 
global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int iLocalOffset = Length( i, Y.ColShift (), r ); + const Int jLocalOffset = Length( j, Y.RowShift (), c ); + + const Int YLDim = Y.LDim (); + + for( Int step=0; step::NextIndex ( numEntries, + putVector_[destination]); + + T* sendBuffer = putVector_[destination][index].data(); + T* XBuffer = Z.Buffer(); + + for( Int t=0; t::Put( Matrix& Z, Int i, Int j ) } template -void RmaInterface::Put( const Matrix& Z, Int i, Int j ) +void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -329,6 +633,7 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -421,9 +726,9 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void RmaInterface::Acc( Matrix& Z, Int i, Int j ) +void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) if ( !toBeAttachedForPut_ ) LogicError("Global matrix cannot be updated."); @@ -487,6 +792,7 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -498,9 +804,9 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) } template -void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) +void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) if ( !toBeAttachedForPut_ ) LogicError("Global matrix cannot be updated."); @@ -563,6 +869,7 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -672,6 +979,11 @@ void RmaInterface::Flush( Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); + // Get does not require flush because + // we guarantee local completion for + // Put/Get/Acc + if( toBeAttachedForGet_ ) + return; mpi::Flush ( window ); } @@ -681,9 +993,14 @@ void RmaInterface::Flush( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - + // Get does not require flush because + // we guarantee local completion for + // Put/Get/Acc + if( 
toBeAttachedForGet_ ) + return; + mpi::Flush ( window ); } diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 08840d9f99..9c5c79a8af 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -643,6 +643,7 @@ void VectorDatatype (El_iov_t * vect_descr, void WindowFree (Window & window) { + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowFree")) SafeMpi (MPI_Win_free (&window)); } From 2f802f134b9aefb94222157d05563b227b40c60e Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 19 Nov 2014 13:59:36 -0600 Subject: [PATCH 096/110] forgot to add cflush in header --- include/El/core/AxpyInterface2.0.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 0e5e6caa74..1a1e831194 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -35,6 +35,9 @@ class AxpyInterface2 void Flush( Matrix& Z ); void Flush( const Matrix& Z ); + void Cflush( Matrix& Z ); + void Cflush( const Matrix& Z ); + // blocking update routines void Put( Matrix& Z, Int i, Int j ); void Put( const Matrix& Z, Int i, Int j ); From 7ba1ccea295bb118527c5d63ee8090da75bab1d3 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 19 Nov 2014 15:12:50 -0600 Subject: [PATCH 097/110] forgot to add function signatures --- include/El/core/RmaInterface.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 9a2be10b55..bfac1206cb 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -38,6 +38,12 @@ class RmaInterface void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); + void Iput( Matrix& Z, Int i, Int j ); + void Iput( const Matrix& Z, Int i, Int j ); + + void Iacc( Matrix& Z, Int i, Int j ); + void Iacc( const Matrix& Z, Int i, Int j ); + void Flush( Matrix& Z, Int i, Int j ); void Flush( const Matrix& Z, Int i, Int j ); void Flush( Matrix& Z ); From ef56b300a5372665f9a4e21c2b1e1268f6242a2a Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 19 Nov 2014 18:55:08 -0600 Subject: [PATCH 098/110] test on data rather than coords --- src/core/AxpyInterface2.0.cpp | 38 ++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index e98d794891..437e8edee1 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -1152,11 +1152,11 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) coord[1] = j; coord[2] = -1; - mpi::TaggedISend (coord, 3, rank, + mpi::TaggedISSend (coord, 3, rank, REQUEST_GET_TAG, g.VCComm (), sendCoordRequests_[rank][cindex]); } - + // test for requests for (Int rank = 0; rank < p; ++rank) { @@ -1165,8 +1165,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) for (Int j = 0; j < numsendCoordRequests; ++j) sendCoordStatuses_[rank][j] = !mpi::Test (sendCoordRequests_[rank][j]); - } - + } + for ( Int step = 0; step < p; ++step ) { mpi::Status status; @@ -1227,11 +1227,6 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // wait/test for send requests for (Int rank = 0; rank < p; ++rank) { - // data sends - const Int numsendDataRequests = sendDataRequests_[rank].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - sendDataStatuses_[rank][j] = - !mpi::Test (sendDataRequests_[rank][j]); // coord sends const Int numSendCoordRequests = sendCoordRequests_[rank].size (); for (Int j = 0; j < 
numSendCoordRequests; ++j) @@ -1242,8 +1237,13 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) sendCoordStatuses_[rank][j] = false; } } + // data sends + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + sendDataStatuses_[rank][j] = + !mpi::Test (sendDataRequests_[rank][j]); } - + // receive data for (Int step = 0; step < p; ++step) { @@ -1283,6 +1283,16 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // wait for send requests for (Int rank = 0; rank < p; ++rank) { + // coord sends + const Int numSendCoordRequests = sendCoordRequests_[rank].size (); + for (Int j = 0; j < numSendCoordRequests; ++j) + { + if (sendCoordStatuses_[rank][j]) + { + mpi::Wait (sendCoordRequests_[rank][j]); + sendCoordStatuses_[rank][j] = false; + } + } // data sends const Int numsendDataRequests = sendDataRequests_[rank].size (); for (Int j = 0; j < numsendDataRequests; ++j) @@ -1404,10 +1414,10 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) for (Int rank = 0; rank < p; ++rank) { // coord sends - const Int numsendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - sendCoordStatuses_[rank][j] = - !mpi::Test (sendCoordRequests_[rank][j]); + const Int numsendDataRequests = sendDataRequests_[rank].size (); + for (Int j = 0; j < numsendDataRequests; ++j) + sendDataStatuses_[rank][j] = + !mpi::Test (sendDataRequests_[rank][j]); } for (Int step=0; step Date: Thu, 20 Nov 2014 12:43:57 -0600 Subject: [PATCH 099/110] added local completion logic --- include/El/core/AxpyInterface2.0.hpp | 6 + include/El/core/RmaInterface.hpp | 2 + src/core/AxpyInterface2.0.cpp | 486 ++++++++++++++++----------- src/core/RmaInterface.cpp | 17 +- 4 files changed, 301 insertions(+), 210 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 1a1e831194..430a7064e5 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -47,6 +47,12 @@ class AxpyInterface2 void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); + void Eput( Matrix& Z, Int i, Int j ); + void Eput( const Matrix& Z, Int i, Int j ); + + void Eacc( Matrix& Z, Int i, Int j ); + void Eacc( const Matrix& Z, Int i, Int j ); + void Detach(); private: diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index bfac1206cb..432125395a 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -30,6 +30,7 @@ class RmaInterface void Attach( DistMatrix& Z ); void Attach( const DistMatrix& Z ); + // Local completion void Put( Matrix& Z, Int i, Int j ); void Put( const Matrix& Z, Int i, Int j ); @@ -38,6 +39,7 @@ class RmaInterface void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); + // No local completion void Iput( Matrix& Z, Int i, Int j ); void Iput( const Matrix& Z, Int i, Int j ); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 437e8edee1..0d0ce9b2bd 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -3,6 +3,8 @@ This file is part of Elemental and is under the BSD 2-Clause License, which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ +/* This interface uses MPI-3 RMA (in the ReadInc function) + */ #include "El-lite.hpp" #include @@ -239,6 +241,288 @@ template return numCreated; } +// nonblocking, local completion + template +void 
AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + for( Int step=0; step(sendCoord_[destination][cindex].data ()); + coord[0] = i; + coord[1] = j; + coord[2] = numEntries; + + mpi::TaggedISend (coord, 3, destination, + COORD_PUT_TAG, g.VCComm (), + sendCoordRequests_[destination][cindex]); + } + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } + + // local completion + for (int i = 0; i < p; i++) + { + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + mpi::Wait (sendCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = false; + } + + // data recvs + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + mpi::Wait (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = false; + } + } +} + +template +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + DistMatrix& X = *GlobalArrayGet_; + + const Int height = Z.Height (); + const Int width = Z.Width (); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid submatrix for Iget"); + + const Grid & g = X.Grid (); + const Int p = g.Size (); + + // Send out the requests to all processes in the grid + for (Int rank = 0; rank < p; ++rank) + { + // modify get count + mpi::ReadInc (getrq_win_, 0, 1, rank); + + // send coordinates + const Int cindex = + NextIndexCoord (sendCoord_[rank], + sendCoordRequests_[rank], + sendCoordStatuses_[rank]); + + Int *coord = sendCoord_[rank][cindex].data (); + coord[0] = i; + coord[1] = j; + coord[2] = -1; + + mpi::TaggedISend (coord, 3, rank, + REQUEST_GET_TAG, g.VCComm (), + sendCoordRequests_[rank][cindex]); + } +} + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + 
LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim(); + + for( Int step=0; step(sendCoord_[destination][cindex].data()); + coord[0] = i; + coord[1] = j; + coord[2] = numEntries; + + mpi::TaggedISend (coord, 3, destination, + COORD_ACC_TAG, g.VCComm(), + sendCoordRequests_[destination][cindex]); + } + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } + // local completion + for (int i = 0; i < p; i++) + { + // coord sends + const Int numsendCoordRequests = sendCoordRequests_[i].size (); + for (Int j = 0; j < numsendCoordRequests; ++j) + { + if (sendCoordStatuses_[i][j]) + mpi::Wait (sendCoordRequests_[i][j]); + sendCoordStatuses_[i][j] = false; + } + + // data recvs + const Int numrecvDataRequests = recvDataRequests_[i].size (); + for (Int j = 0; j < numrecvDataRequests; ++j) + { + if (recvDataStatuses_[i][j]) + mpi::Wait (recvDataRequests_[i][j]); + recvDataStatuses_[i][j] = false; + } + } +} + +// nonblocking, no local completion template void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) { @@ -926,11 +1210,11 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) recvVector_.clear(); } -// blocking update routines +// end-to-end blocking update routines template -void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eput")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -1115,201 +1399,9 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) } template -void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myRow = g.Row(); - const Int myCol = g.Col(); - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - std::vector recvVector_; - - // Send out the get requests to everyone in grid - for (Int rank = 0; rank < p; ++rank) - { - // send coordinates - const Int cindex = - NextIndexCoord (sendCoord_[rank], - sendCoordRequests_[rank], - sendCoordStatuses_[rank]); - - Int *coord = sendCoord_[rank][cindex].data (); - coord[0] = i; - coord[1] = j; - coord[2] = -1; - - mpi::TaggedISSend (coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - sendCoordRequests_[rank][cindex]); - } - - // test for requests - for (Int rank = 0; rank < p; ++rank) - { - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - 
sendCoordStatuses_[rank][j] = - !mpi::Test (sendCoordRequests_[rank][j]); - } - - for ( Int step = 0; step < p; ++step ) - { - mpi::Status status; - if (mpi::IProbe - (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // post receive for coordinates - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - REQUEST_GET_TAG, g.VCComm()); - Int i = coord[0]; - Int j = coord[1]; - - // we need the localwidth/height here, - // used also to calculate numEntries - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - const Int numEntries = localHeight * localWidth; - - DEBUG_ONLY (if (numEntries < Int (sizeof (T))) - LogicError ("Count was too small");) - - const Int index = - NextIndexData (numEntries, - sendData_[source], - sendDataRequests_[source], - sendDataStatuses_[source]); - - DEBUG_ONLY (if - (Int (sendData_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *replyBuffer = sendData_[source][index].data (); - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &replyBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } - - // Fire off non-blocking send - mpi::TaggedISend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), - sendDataRequests_[source][index]); - } - } - - // wait/test for send requests - for (Int rank = 0; rank < p; ++rank) - { - // coord sends - const Int numSendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < numSendCoordRequests; ++j) - { - if (sendCoordStatuses_[rank][j]) - { - mpi::Wait (sendCoordRequests_[rank][j]); - sendCoordStatuses_[rank][j] = false; - } - } - // data sends - const Int numsendDataRequests = sendDataRequests_[rank].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - sendDataStatuses_[rank][j] = - !mpi::Test (sendDataRequests_[rank][j]); - } - - // receive data - for (Int step = 0; step < p; ++step) - { - mpi::Status status; - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); - - // Compute the local heights and offsets - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - } - } - - // wait for send requests - for (Int rank = 0; rank < p; ++rank) - { - // coord sends - const Int numSendCoordRequests = sendCoordRequests_[rank].size (); - for (Int j = 0; j < 
numSendCoordRequests; ++j) - { - if (sendCoordStatuses_[rank][j]) - { - mpi::Wait (sendCoordRequests_[rank][j]); - sendCoordStatuses_[rank][j] = false; - } - } - // data sends - const Int numsendDataRequests = sendDataRequests_[rank].size (); - for (Int j = 0; j < numsendDataRequests; ++j) - { - if (sendDataStatuses_[rank][j]) - { - mpi::Wait (sendDataRequests_[rank][j]); - sendDataStatuses_[rank][j] = false; - } - } - } -} - -template -void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eacc")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 8834b7f694..fe7aaead4b 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -185,7 +185,7 @@ Int RmaInterface::NextIndex return numCreated; } -// Blocking +// Locally Blocking template void RmaInterface::Put( Matrix& Z, Int i, Int j ) { @@ -252,13 +252,12 @@ void RmaInterface::Put( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // Flush all transfers - mpi::Flush (window); } template @@ -326,13 +325,12 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // Flush all transfers - mpi::Flush (window); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, @@ -404,13 +402,12 @@ void RmaInterface::Acc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // Flush all transfers - mpi::Flush (window); } template @@ -485,8 +482,6 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } - // Flush all transfers - mpi::Flush (window); } template @@ -555,7 +550,6 @@ void RmaInterface::Iput( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -633,7 +627,6 @@ void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -792,7 +785,6 @@ void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -869,7 +861,6 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) mpi::Iacc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } - mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) From 
57093d6f446bf46ea7169ac8b62da90c1e1da452 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 26 Jan 2015 23:05:06 -0800 Subject: [PATCH 100/110] changed the data structure, now we have coords and matrices as structs...added non-collective flush and some synch functions --- include/El/core/AxpyInterface2.0.hpp | 124 +- include/El/core/imports/mpi.hpp | 1 - src/core/AxpyInterface2.0.cpp | 1992 +++++++++++++------------- src/core/imports/mpi.cpp | 51 +- 4 files changed, 1123 insertions(+), 1045 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 430a7064e5..87c683d0e2 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -7,7 +7,6 @@ #ifndef EL_AXPYINTERFACE2_HPP #define EL_AXPYINTERFACE2_HPP -#if MPI_VERSION>=3 namespace El { template class AxpyInterface2 @@ -19,11 +18,15 @@ class AxpyInterface2 AxpyInterface2( DistMatrix& Z ); AxpyInterface2( const DistMatrix& Z ); + // collective epoch initialization routines void Attach( DistMatrix& Z ); void Attach( const DistMatrix& Z ); + void Detach(); - // nonblocking update routines - // requires flush for completion + // remote update routines + + // requires Flush for local+remote + // completion void Iput( Matrix& Z, Int i, Int j ); void Iput( const Matrix& Z, Int i, Int j ); @@ -31,14 +34,9 @@ class AxpyInterface2 void Iacc( Matrix& Z, Int i, Int j ); void Iacc( const Matrix& Z, Int i, Int j ); - - void Flush( Matrix& Z ); - void Flush( const Matrix& Z ); - void Cflush( Matrix& Z ); - void Cflush( const Matrix& Z ); - - // blocking update routines + // locally blocking update routines + // currently not implemented void Put( Matrix& Z, Int i, Int j ); void Put( const Matrix& Z, Int i, Int j ); @@ -47,14 +45,10 @@ class AxpyInterface2 void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - void Eput( Matrix& Z, Int i, Int j ); - void Eput( const Matrix& Z, Int i, Int j ); - - void Eacc( Matrix& Z, Int i, Int j ); - void Eacc( const Matrix& Z, Int i, Int j ); - - void Detach(); - + // synchronization routines + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + private: static const Int @@ -63,26 +57,36 @@ class AxpyInterface2 DATA_ACC_TAG =3, REQUEST_GET_TAG =4, COORD_ACC_TAG =5, - COORD_PUT_TAG =6; + COORD_PUT_TAG =6; + + // struct for passing data + struct matrix_params_ + { + T *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector matrices_; + + // struct for passing coordinates + struct coord_params_ + { + T *base_; + std::vector>> + coord_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector coords_; - // request statuses - std::vector> - sendDataStatuses_, sendCoordStatuses_, - recvDataStatuses_, recvCoordStatuses_; - - // request handles - std::vector> - sendDataRequests_, sendCoordRequests_, - recvDataRequests_, recvCoordRequests_; - - // data - std::vector>> - sendData_, recvData_; - - // coords - std::vector>> - sendCoord_, recvCoord_; - // TODO need to add const here... 
DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; @@ -90,29 +94,30 @@ class AxpyInterface2 bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - // op count window for read increment - mpi::Window put_win_, acc_win_, getrq_win_; - long *put_win_base_, *acc_win_base_, - *getrq_win_base_; - // next index for data and coord - Int NextIndexData - ( Int dataSize, - std::deque>& data, - std::deque& requests, - std::deque& requestStatuses ); - - Int NextIndexCoord - ( std::deque>& coord, - std::deque& requests, - std::deque& requestStatuses ); - - // TODO - Int GetIndexData( Matrix& Z ); - Int GetIndexCoord( Matrix& Z ); - - bool TestRequests( Matrix& Z ); - void WaitRequests( Matrix& Z ); + Int NextIndexData ( + Int target, + Int dataSize, + T * base_address, + Int *mindex); + + Int NextIndexCoord ( + Int i, Int j, + Int target, + T * base_address, + Int *cindex); + + bool Testall(); + bool Test( Matrix& Z ); + bool Test( const Matrix& Z ); + bool TestAny( Matrix& Z ); + bool TestAny( const Matrix& Z ); + + void Waitall(); + void Wait( Matrix& Z ); + void Wait( const Matrix& Z ); + void WaitAny( Matrix& Z ); + void WaitAny( const Matrix& Z ); // these are only used for nonblocking // update rountines @@ -125,5 +130,4 @@ class AxpyInterface2 void HandleLocalToGlobalAcc( const Matrix& Z, Int source ); }; } // namespace El -#endif // MPI-3 #endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index c45b86d061..c09b08748e 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -351,7 +351,6 @@ void Wait( Request& request ); void Wait( Request& request, Status& status ); //TODO add another function for getting statuses void WaitAny (int numRequests, Request * requests, Int * index); - void WaitAll( int numRequests, Request* requests ); void WaitAll( int numRequests, Request* requests, Status* statuses ); bool Test( Request& request ); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 0d0ce9b2bd..0aad2e4367 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -3,8 +3,6 @@ This file is part of Elemental and is under the BSD 2-Clause License, which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ -/* This interface uses MPI-3 RMA (in the ReadInc function) - */ #include "El-lite.hpp" #include @@ -15,16 +13,9 @@ namespace El template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - sendDataStatuses_(0), sendCoordStatuses_(0), - recvDataStatuses_(0), recvCoordStatuses_(0), - sendDataRequests_(0), sendCoordRequests_(0), - recvDataRequests_(0), recvCoordRequests_(0), - sendData_(0), recvData_(0), - sendCoord_(0), recvCoord_(0), - put_win_(0), acc_win_(0), getrq_win_(0), - put_win_base_(0), acc_win_base_(0), getrq_win_base_(0), + matrices_(0), coords_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), - attached_(false), detached_(false) + attached_(false), detached_(false) { } template @@ -42,42 +33,29 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) const Grid& g = Z.Grid(); const Int p = g.Size (); - if ( sendData_.empty() ) + if ( matrices_.empty() ) { - sendDataStatuses_.resize (p); - sendCoordStatuses_.resize (p); - recvDataStatuses_.resize (p); - recvCoordStatuses_.resize (p); - - sendDataRequests_.resize (p); - sendCoordRequests_.resize (p); - recvDataRequests_.resize (p); - recvCoordRequests_.resize (p); - - sendData_.resize 
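For context on the members deleted just below: the put/acc/getrq windows implemented a remote read-and-increment of per-process operation counters. In plain MPI-3 that pattern is a fetch-and-op on a locked window, roughly as follows (a sketch of the presumed ReadInc semantics, not the removed implementation itself):

#include <mpi.h>

// Atomically add incr to a long counter on rank and return its old value.
// Assumes the window is already locked (e.g. via MPI_Win_lock_all).
long read_inc( MPI_Win win, int rank, long incr )
{
    long oldValue = 0;
    MPI_Fetch_and_op( &incr, &oldValue, MPI_LONG, rank,
                      /*target_disp=*/0, MPI_SUM, win );
    MPI_Win_flush( rank, win );  // complete the operation at the target
    return oldValue;
}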
(p);
-        sendCoord_.resize (p);
-        recvData_.resize (p);
-        recvCoord_.resize (p);
+        struct matrix_params_ mp;
+        mp.data_.resize(p);
+        mp.requests_.resize(p);
+        mp.statuses_.resize(p);
+        mp.base_ = NULL;
+        // push back new matrix_params created
+        // with default constructor
+        matrices_.push_back( mp );
+    }
+
+    if ( coords_.empty() )
+    {
+        struct coord_params_ cp;
+        cp.coord_.resize(p);
+        cp.requests_.resize(p);
+        cp.statuses_.resize(p);
+        cp.base_ = NULL;
+        // push back new coord_params created
+        // with default constructor
+        coords_.push_back( cp );
    }
-
-    // count window related
-    put_win_base_ = new long;
-    mpi::WindowCreate ( put_win_base_, sizeof(long),
-            g.VCComm(), put_win_ );
-    memset (put_win_base_, 0, sizeof (long));
-    mpi::WindowLock (put_win_);
-
-    acc_win_base_ = new long;
-    mpi::WindowCreate ( acc_win_base_, sizeof(long),
-            g.VCComm(), acc_win_ );
-    memset (acc_win_base_, 0, sizeof (long));
-    mpi::WindowLock (acc_win_);
-
-    getrq_win_base_ = new long;
-    mpi::WindowCreate ( getrq_win_base_, sizeof(long),
-            g.VCComm(), getrq_win_ );
-    memset (getrq_win_base_, 0, sizeof (long));
-    mpi::WindowLock (getrq_win_);
 }
 
 template
@@ -101,423 +79,633 @@ AxpyInterface2::~AxpyInterface2()
     }
 }
 
- template
-void AxpyInterface2::Attach( DistMatrix& Z )
+template
+Int AxpyInterface2::NextIndexData (
+    Int target,
+    Int dataSize,
+    T * base_address,
+    Int *mindex)
 {
-    DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach"))
-    // attached_ will be only set in Attach
-    // and only unset in Detach
-    if (!attached_)
-        attached_ = true;
-    else
-        LogicError("Must detach before reattaching.");
+    DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexData"))
+
+    assert ( base_address != NULL );
 
-    // the matrix base_ is not known until
-    // an update operation (put/get/acc)
-    // so it is kept blank
+    Int matrixIndex = 0;
+    DistMatrix& Y = *GlobalArrayGet_;
+    const Grid& g = Y.Grid();
+    const Int p = g.Size();
+    const Int numMatrices = matrices_.size();
 
-    // if DistMatrix is non-const, all one-sided
-    // transfers -- put, get and acc are possible
-    if( !toBeAttachedForPut_ && !toBeAttachedForGet_ )
+    // search for matrix base
+    for (Int m = 0; m < numMatrices; m++)
     {
-        GlobalArrayPut_ = &Z;
-        toBeAttachedForPut_ = true;
-        GlobalArrayGet_ = &Z;
-        toBeAttachedForGet_ = true;
-
-        const Grid& g = Z.Grid();
-        const Int p = g.Size ();
-
-        if ( sendData_.empty() )
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+        // uninitialized: claim this slot on first use
+        if ( matrices_[m].base_ == NULL )
         {
-            sendDataStatuses_.resize (p);
-            sendCoordStatuses_.resize (p);
-            recvDataStatuses_.resize (p);
-            recvCoordStatuses_.resize (p);
-
-            sendDataRequests_.resize (p);
-            sendCoordRequests_.resize (p);
-            recvDataRequests_.resize (p);
-            recvCoordRequests_.resize (p);
-
-            sendData_.resize (p);
-            sendCoord_.resize (p);
-            recvData_.resize (p);
-            recvCoord_.resize (p);
+            matrices_[m].base_ = base_address;
+            matrixIndex = m;
+            break;
         }
-        // count window related
-        put_win_base_ = new long;
-        mpi::WindowCreate ( put_win_base_, sizeof(long),
-                g.VCComm(), put_win_ );
-        memset (put_win_base_, 0, sizeof (long));
-        mpi::WindowLock (put_win_);
-
-        acc_win_base_ = new long;
-        mpi::WindowCreate ( acc_win_base_, sizeof(long),
-                g.VCComm(), acc_win_ );
-        memset (acc_win_base_, 0, sizeof (long));
-        mpi::WindowLock (acc_win_);
-
-        getrq_win_base_ = new long;
-        mpi::WindowCreate ( getrq_win_base_, sizeof(long),
-                g.VCComm(), getrq_win_ );
-        memset (getrq_win_base_, 0, sizeof (long));
-        mpi::WindowLock (getrq_win_);
+        matrixIndex = 
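The slot-recycling scan just below is the pattern NextIndexData and NextIndexCoord now share: test each in-flight request once, reuse the first finished slot, and grow the deque only when none is free. The same logic with raw MPI requests (a simplified sketch; names are hypothetical):

#include <mpi.h>
#include <cstddef>
#include <deque>

// Return a request slot that may be reused; extend the deques on demand.
int next_free_slot( std::deque<MPI_Request>& reqs, std::deque<bool>& busy )
{
    for( std::size_t s = 0; s < reqs.size(); ++s )
    {
        if( busy[s] )
        {
            int done = 0;
            MPI_Test( &reqs[s], &done, MPI_STATUS_IGNORE );
            busy[s] = !done;
        }
        if( !busy[s] )
        {
            busy[s] = true;
            return static_cast<int>( s );
        }
    }
    reqs.push_back( MPI_REQUEST_NULL );
    busy.push_back( true );
    return static_cast<int>( reqs.size() ) - 1;
}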
m+1; } -} + + // need to create new object + if ( matrixIndex == numMatrices) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; + } + // go through the request, data, + // status objects + const Int numCreated = matrices_[matrixIndex].data_[target].size (); + DEBUG_ONLY (if (numCreated != Int (matrices_[matrixIndex].requests_[target].size ()) || + numCreated != Int (matrices_[matrixIndex].statuses_[target].size ())) + LogicError ("size mismatch");) -template - Int AxpyInterface2::NextIndexData - (Int dataSize, - std::deque < std::vector < T >> &data, - std::deque < mpi::Request > &requests, - std::deque < bool > &requestStatuses) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexData")) - const Int numCreated = data.size (); - DEBUG_ONLY (if (numCreated != Int (requests.size ()) || - numCreated != - Int (requestStatuses.size ()))LogicError - ("size mismatch");) for (Int i = 0; i < numCreated; ++i) { - // If this request is still running, test to see if it finished. - if (requestStatuses[i]) + // If this request is still running, + // test to see if it finished. + if (matrices_[matrixIndex].statuses_[target][i]) { - const bool finished = mpi::Test (requests[i]); - requestStatuses[i] = !finished; + const bool finished = mpi::Test (matrices_[matrixIndex].requests_[target][i]); + matrices_[matrixIndex].statuses_[target][i] = !finished; } - if (!requestStatuses[i]) + if (!matrices_[matrixIndex].statuses_[target][i]) { - requestStatuses[i] = true; - data[i].resize (dataSize); - return i; + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize ( dataSize ); + *mindex = matrixIndex; + return i; } } - data.resize (numCreated + 1); - data[numCreated].resize (dataSize); - requests.push_back (mpi::REQUEST_NULL); - requestStatuses.push_back (true); - + matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); + matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); + matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); + matrices_[matrixIndex].statuses_[target].push_back ( true ); + *mindex = matrixIndex; + return numCreated; - } +} template - Int AxpyInterface2::NextIndexCoord - (std::deque < std::array > &coord, - std::deque < mpi::Request > &requests, - std::deque < bool > &requestStatuses) - { +Int AxpyInterface2::NextIndexCoord ( + Int i, Int j, + Int target, + T * base_address, + Int *cindex) +{ DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexCoord")) - const Int numCreated = coord.size (); - DEBUG_ONLY (if (numCreated != Int (requests.size ()) || - numCreated != - Int (requestStatuses.size ()))LogicError - ("size mismatch");) - + assert ( base_address != NULL ); + + Int coordIndex = 0; + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + const Int numCoords = coords_.size(); + + // search for matrix base + for (Int m = 0; m < numCoords; m++) + { + if ( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + if ( coords_[m].base_ == NULL ) + { + coords_[m].base_ = base_address; + coordIndex = m; + break; + } + coordIndex = m+1; + } + + // need to create new object + if ( coordIndex == numCoords ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = 
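Each coordinate message in this interface is three integers, and the third element doubles as a selector: a positive numEntries announces an incoming payload on a data tag, while -1 (used by Get/Iget) requests a reply. A sketch of the packing, assuming Int is a plain int:

// Triple carried by every TaggedISend( coord_, 3, ... ) in this file.
struct CoordMsg { int i, j, numEntries; };

inline CoordMsg PutCoord( int i, int j, int numEntries )
{ return CoordMsg{ i, j, numEntries }; }  // payload follows separately

inline CoordMsg GetCoord( int i, int j )
{ return CoordMsg{ i, j, -1 }; }          // -1: "send me this submatrix"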
NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + coords_[coordIndex].base_ = base_address; + } + // go through the request, data, + // status objects + const Int numCreated = coords_[coordIndex].coord_[target].size (); + DEBUG_ONLY (if (numCreated != Int (coords_[coordIndex].requests_[target].size ()) || + numCreated != Int (matrices_[coordIndex].statuses_[target].size ())) + LogicError ("size mismatch");) + for (Int i = 0; i < numCreated; ++i) { - // If this request is still running, test to see if it finished. - if (requestStatuses[i]) + // If this request is still running, + // test to see if it finished. + if (coords_[coordIndex].statuses_[target][i]) { - const bool finished = mpi::Test (requests[i]); - requestStatuses[i] = !finished; + const bool finished = mpi::Test (coords_[coordIndex].requests_[target][i]); + coords_[coordIndex].statuses_[target][i] = !finished; } - if (!requestStatuses[i]) + if (!coords_[coordIndex].statuses_[target][i]) { - requestStatuses[i] = true; - return i; + coords_[coordIndex].statuses_[target][i] = true; + coords_[coordIndex].coord_[target][i][0] = i; + coords_[coordIndex].coord_[target][i][1] = j; + *cindex = coordIndex; + return i; } } - coord.resize (numCreated + 1); - requests.push_back (mpi::REQUEST_NULL); - requestStatuses.push_back (true); - + coords_[coordIndex].coord_[target].resize ( numCreated + 1 ); + coords_[coordIndex].coord_[target][numCreated][0] = i; + coords_[coordIndex].coord_[target][numCreated][1] = j; + coords_[coordIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); + coords_[coordIndex].statuses_[target].push_back ( true ); + *cindex = coordIndex; + return numCreated; - } +} -// nonblocking, local completion - template +template +void AxpyInterface2::Attach( DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) + // attached_ will be only set in Attach + // and only unset in Detach + if (!attached_) + attached_ = true; + else + LogicError("Must detach before reattaching."); + + const Grid& g = Z.Grid(); + const Int p = g.Size (); + + // the matrix base_ is not known until + // an update operation (put/get/acc) + // so it is kept blank + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } + } + + mpi::Barrier (g.VCComm()); +} + +// end-to-end blocking put/acc routines +template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > 
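Every transfer below sizes its buffers with the same block-cyclic arithmetic: Shift locates a process's first element relative to the alignment, and Length counts how many rows or columns of the patch land there. A plain-C++ restatement (my reading of Elemental's Shift/Length, stated here as an assumption):

// First global index owned by rank under alignment align, stride processes.
inline int ShiftOf( int rank, int align, int stride )
{ return ( rank + stride - align ) % stride; }

// Number of indices in [0,n) congruent to shift (mod stride).
inline int LengthOf( int n, int shift, int stride )
{ return n > shift ? ( n - shift - 1 ) / stride + 1 : 0; }

// Example: a height-5 patch on a 2-row grid aligned at 0 gives
// LengthOf(5,0,2) == 3 local rows on row 0 and LengthOf(5,1,2) == 2 on row 1.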
Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); + + const Int XLDim = Z.LDim(); + + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + T* XBuffer = Z.Buffer(); + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); - + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int YLDim = Y.LDim(); + Int matrix_index, coord_index; + + // data/coord send for( Int step=0; step 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data const Int dindex = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); + NextIndexData (destination, + numEntries, + XBuffer, + &matrix_index); - DEBUG_ONLY (if - (Int (sendData_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndex");) + DEBUG_ONLY (if + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != + numEntries) LogicError ("Error in NextIndexData");) + + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + + for( Int t=0; t(sendCoord_[destination][cindex].data ()); - coord[0] = i; - coord[1] = j; - coord[2] = numEntries; - - mpi::TaggedISend (coord, 3, destination, - COORD_PUT_TAG, g.VCComm (), - sendCoordRequests_[destination][cindex]); + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); } - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; } - // local completion - for (int i = 0; i < p; i++) - { - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - { - if (sendCoordStatuses_[i][j]) - mpi::Wait (sendCoordRequests_[i][j]); - sendCoordStatuses_[i][j] = false; - } + // poke + Test (Z); + + // data/coord receive + std::vector recvVector_; - // data recvs - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) + for (Int step=0; step -void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) - // a call to Attach with a non-const DistMatrix must set - // toBeAttachedForGet_ also, if not then it is assumed that - // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - DistMatrix& X = *GlobalArrayGet_; + // data + mpi::TaggedRecv (recvBuffer, count, source, + DATA_PUT_TAG, g.VCComm ()); - const Int height = Z.Height (); - const Int width = Z.Width (); - - if (i + height > X.Height 
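Get, whose body follows, is a two-phase exchange: broadcast an (i, j, -1) coordinate to every rank, then service replies while continuing to progress incoming requests. The shape of its reply loop in raw MPI (a sketch with a hypothetical tag constant and double payloads):

#include <mpi.h>
#include <vector>

// Drain p replies on tag dataGetTag, resizing the buffer per message.
void DrainReplies( MPI_Comm comm, int p, int dataGetTag )
{
    int numReplies = 0;
    std::vector<double> buf;
    while( numReplies < p )
    {
        MPI_Status status;
        int ready = 0;
        MPI_Iprobe( MPI_ANY_SOURCE, dataGetTag, comm, &ready, &status );
        if( !ready )
            continue;  // a real loop also progresses outgoing sends here
        int count = 0;
        MPI_Get_count( &status, MPI_DOUBLE, &count );
        buf.resize( count );
        MPI_Recv( buf.data(), count, MPI_DOUBLE, status.MPI_SOURCE,
                  dataGetTag, comm, MPI_STATUS_IGNORE );
        ++numReplies;  // unpack buf into the local matrix here
    }
}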
() || j + width > X.Width ()) - LogicError ("Invalid submatrix for Iget"); + // Update Y + const T *XBuffer = recvBuffer; + const Int colAlign = (Y.ColAlign () + i) % r; + const Int rowAlign = (Y.RowAlign () + j) % c; - const Grid & g = X.Grid (); - const Int p = g.Size (); + const Int colShift = Shift (g.Row(), colAlign, r); + const Int rowShift = Shift (g.Col(), rowAlign, c); - // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) - { - // modify get count - mpi::ReadInc (getrq_win_, 0, 1, rank); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); - // send coordinates - const Int cindex = - NextIndexCoord (sendCoord_[rank], - sendCoordRequests_[rank], - sendCoordStatuses_[rank]); + const Int iLocalOffset = Length (i, Y.ColShift (), r); + const Int jLocalOffset = Length (j, Y.RowShift (), c); - Int *coord = sendCoord_[rank][cindex].data (); - coord[0] = i; - coord[1] = j; - coord[2] = -1; - - mpi::TaggedISend (coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - sendCoordRequests_[rank][cindex]); + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); + const T *XCol = &XBuffer[t * localHeight]; + MemCopy (YCol, XCol, localHeight); + } + } } + + // wait + Wait (Z); + + recvVector_.clear(); } -// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, -// where X is height x width template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; - + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); + + const Int XLDim = Z.LDim(); + + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); + const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + T* XBuffer = Z.Buffer(); + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int YLDim = Y.LDim(); + + // data/coord receive + std::vector recvVector_; + Int matrix_index, coord_index; + // data/coord send for( Int step=0; step 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data + const Int dindex = + NextIndexData (destination, + numEntries, + XBuffer, + &matrix_index); - // send data - const Int dindex = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); + DEBUG_ONLY (if + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != + numEntries) LogicError ("Error in NextIndexData");) + + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - DEBUG_ONLY (if - 
(Int (sendData_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = sendData_[destination][dindex].data (); for( Int t=0; t(sendCoord_[destination][cindex].data()); - coord[0] = i; - coord[1] = j; - coord[2] = numEntries; + Int * coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; - mpi::TaggedISend (coord, 3, destination, + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, COORD_ACC_TAG, g.VCComm(), - sendCoordRequests_[destination][cindex]); + coords_[coord_index].requests_[destination][cindex]); } - - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; } - // local completion - for (int i = 0; i < p; i++) + + // test for requests + Test (Z); + + for (Int step=0; step +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if ( !toBeAttachedForGet_ ) + LogicError ("Cannot perform this operation as matrix is not attached."); + DistMatrix& X = *GlobalArrayGet_; + + const Int height = Z.Height (); + const Int width = Z.Width (); + + if (i + height > X.Height () || j + width > X.Width ()) + LogicError ("Invalid submatrix for Iget"); + + T* XBuffer = Z.Buffer(); + const Grid & g = X.Grid (); + const Int p = g.Size (); + const Int r = g.Height (); + const Int c = g.Width (); + + Int coord_index; + std::vector recvVector_; + + // Send out the requests to all processes in the grid + for (Int rank = 0; rank < p; ++rank) + { + const Int cindex = + NextIndexCoord (i, j, + rank, + XBuffer, + &coord_index); + + Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); + coord[0] = i; + coord[1] = j; + coord[2] = -1; + + mpi::TaggedISend (coord, 3, rank, + REQUEST_GET_TAG, g.VCComm (), + coords_[coord_index].requests_[rank][cindex]); + } + + // Receive all of the replies + Int numReplies = 0; + while (numReplies < p) + { + mpi::Status status; + HandleGlobalToLocalData ( Z ); + if (mpi::IProbe + (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) { - if (recvDataStatuses_[i][j]) - mpi::Wait (recvDataRequests_[i][j]); - recvDataStatuses_[i][j] = false; + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount (status); + recvVector_.resize (count); + T *recvBuffer = recvVector_.data (); + + // Receive the data + mpi::TaggedRecv + (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); + + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = (X.ColAlign () + i) % r; + const Int rowAlign = (X.RowAlign () + j) % c; + const Int colShift = Shift (myRow, colAlign, r); + const Int rowShift = Shift (myCol, rowAlign, c); + const Int localHeight = Length (height, colShift, r); + const Int localWidth = Length (width, rowShift, c); + + // Unpack the local matrix + for (Int t = 0; t < localWidth; ++t) + { + T *YCol = X.Buffer (0, rowShift + t * c); + const T *XCol = &recvBuffer[t * localHeight]; + for (Int s = 0; s < localHeight; ++s) + YCol[colShift + s * r] = XCol[s]; + } + ++numReplies; + recvVector_.clear(); } } } @@ 
-548,6 +736,7 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int XLDim = Z.LDim(); + Int matrix_index, coord_index; // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); @@ -556,6 +745,7 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) Int receivingCol = myProcessCol; const Int YLDim = Y.LDim (); + T* XBuffer = Z.Buffer(); for( Int step=0; step::Iput( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const T* XBuffer = Z.LockedBuffer(); - // put count - mpi::ReadInc (put_win_, 0, 1, destination); - + const Int dindex = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); - + NextIndexData (destination, + numEntries, + XBuffer, + &matrix_index); + DEBUG_ONLY (if - (Int (sendData_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndex");) + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != + numEntries) LogicError ("Error in NextIndexData");) + + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - T *sendBuffer = sendData_[destination][dindex].data (); - for( Int t=0; t(sendCoord_[destination][cindex].data ()); - coord[0] = i; - coord[1] = j; - coord[2] = numEntries; - - mpi::TaggedISend (coord, 3, destination, - COORD_PUT_TAG, g.VCComm (), - sendCoordRequests_[destination][cindex]); - } + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); + } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -631,6 +822,9 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) const Int height = Z.Height (); const Int width = Z.Width (); + T* XBuffer = Z.Buffer(); + + Int coord_index; if (i + height > X.Height () || j + width > X.Width ()) LogicError ("Invalid submatrix for Iget"); @@ -641,23 +835,22 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // Send out the requests to all processes in the grid for (Int rank = 0; rank < p; ++rank) { - // modify get count - mpi::ReadInc (getrq_win_, 0, 1, rank); - // send coordinates const Int cindex = - NextIndexCoord (sendCoord_[rank], - sendCoordRequests_[rank], - sendCoordStatuses_[rank]); - - Int *coord = sendCoord_[rank][cindex].data (); - coord[0] = i; - coord[1] = j; - coord[2] = -1; - - mpi::TaggedISend (coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - sendCoordRequests_[rank][cindex]); + NextIndexCoord (i, j, + rank, + XBuffer, + &coord_index); + + Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = -1; + + // post receive for coordinates + mpi::TaggedISend (coord_, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex]); } } @@ -674,6 +867,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; + Int matrix_index, coord_index; //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) @@ -696,7 +890,8 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) Int receivingRow = myProcessRow; Int 
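The nonblocking routines in these hunks only start a transfer; neither the input matrix nor the target may be assumed current until a synchronization call returns. A hypothetical usage sketch with the names from this interface (assuming the usual El.hpp umbrella header):

#include "El.hpp"
using namespace El;

// Accumulate a local tile into Y at (i,j), then make Z reusable.
void SketchIacc( DistMatrix<double>& Y, Matrix<double>& Z, Int i, Int j )
{
    AxpyInterface2<double> interface;
    interface.Attach( Y );      // collective over Y's grid
    interface.Iacc( Z, i, j );  // returns immediately; Z is in flight
    interface.Flush( Z );       // local+remote completion for Z
    interface.Detach();         // collective
}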
receivingCol = myProcessCol; - const Int YLDim = Y.LDim(); + const Int YLDim = Y.LDim(); + T* XBuffer = Z.Buffer(); for( Int step=0; step::Iacc( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const T* XBuffer = Z.LockedBuffer(); - // acc count - mpi::ReadInc (acc_win_, 0, 1, destination); - - // send data - const Int dindex = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); - + const Int dindex = + NextIndexData (destination, + numEntries, + XBuffer, + &matrix_index); + DEBUG_ONLY (if - (Int (sendData_[destination][dindex].size ()) != + (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != numEntries) LogicError ("Error in NextIndexData");) - T *sendBuffer = sendData_[destination][dindex].data (); + T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); for( Int t=0; t::Iacc( Matrix& Z, Int i, Int j ) for( Int s=0; s(sendCoord_[destination][cindex].data()); - coord[0] = i; - coord[1] = j; - coord[2] = numEntries; + NextIndexCoord (i, j, + destination, + XBuffer, + &coord_index); + + Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; - mpi::TaggedISend (coord, 3, destination, + mpi::TaggedISend (coord_, 3, destination, COORD_ACC_TAG, g.VCComm(), - sendCoordRequests_[destination][cindex]); + coords_[coord_index].requests_[destination][cindex]); } receivingRow = (receivingRow + 1) % r; @@ -760,103 +952,289 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } +// waitny implementation +// cannot use mpi::Waitany +// as of now because request +// objects are vector of deques template -void AxpyInterface2::WaitRequests( Matrix& Z ) +void AxpyInterface2::WaitAny( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::WaitRequests")) - - DistMatrix& Y = *GlobalArrayPut_; + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::WaitAny")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); - const Int p = g.Size (); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + const T *base_address = Z.LockedBuffer(); - for (Int i = 0; i < p; ++i) + // search for matrix base + for (Int m = 0; m < numMatrices; m++) { - // coord recvs - const Int numrecvCoordRequests = recvCoordRequests_[i].size (); - for (Int j = 0; j < numrecvCoordRequests; ++j) - { - if (recvCoordStatuses_[i][j]) - mpi::Wait (recvCoordRequests_[i][j]); - recvCoordStatuses_[i][j] = false; - } - - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) - { - if (sendCoordStatuses_[i][j]) - mpi::Wait (sendCoordRequests_[i][j]); - sendCoordStatuses_[i][j] = false; - } - - // data recvs - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) + if ( matrices_[m].base_ == base_address ) { - if (recvDataStatuses_[i][j]) - mpi::Wait (recvDataRequests_[i][j]); - recvDataStatuses_[i][j] = false; + matrixIndex = m; + break; } + matrixIndex = m+1; + } - // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); - for (Int j = 0; j < numsendDataRequests; ++j) + // search for matrix base in coords + for (Int c = 0; c < numCoords; 
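MPI_Waitany needs its requests in one contiguous array, but here each rank's requests live in separate deques, hence the hand-rolled scan in WaitAny above. For contrast, the array form the comment says cannot yet be used (standard MPI, shown only to motivate the manual loop):

#include <mpi.h>
#include <vector>

// Block until one request finishes; returns its index,
// or MPI_UNDEFINED if every request was already inactive.
int WaitForAny( std::vector<MPI_Request>& reqs )
{
    int index = MPI_UNDEFINED;
    MPI_Waitany( static_cast<int>( reqs.size() ), reqs.data(),
                 &index, MPI_STATUS_IGNORE );
    return index;
}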
c++) + { + if ( coords_[c].base_ == base_address ) { - if (sendDataStatuses_[i][j]) - mpi::Wait (sendDataRequests_[i][j]); - sendDataStatuses_[i][j] = false; + coordIndex = c; + break; } + coordIndex = c+1; } -} -template -bool AxpyInterface2::TestRequests( Matrix& Z ) + // matrix not found + if ( matrixIndex == numMatrices && + coordIndex == numCoords) + return; + + // data + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + if (!matrices_[matrixIndex].statuses_[rank][i]) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + return; + } + else + continue; + } + } + + // coordinates + for (int rank = 0; rank < p; ++rank) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + if (!coords_[coordIndex].statuses_[rank][i]) + { + mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + return; + } + else + continue; + } + } +} + +template +void AxpyInterface2::Wait( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::TestRequests")) - - DistMatrix& Y = *GlobalArrayPut_; + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Wait")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); - const Int p = g.Size (); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + const T *base_address = Z.LockedBuffer(); - for (Int i = 0; i < p; ++i) + // search for matrix base + for (Int m = 0; m < numMatrices; m++) { - // coord recvs - const Int numrecvCoordRequests = recvCoordRequests_[i].size (); - for (Int j = 0; j < numrecvCoordRequests; ++j) + if ( matrices_[m].base_ == base_address ) { - recvCoordStatuses_[i][j] = - !mpi::Test (recvCoordRequests_[i][j]); - if (recvCoordStatuses_[i][j]) - return false; - } + matrixIndex = m; + break; + } + matrixIndex = m+1; + } + + // search for matrix base in coords + for (Int c = 0; c < numCoords; c++) + { + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + coordIndex = c+1; + } + + // matrix not found + if ( matrixIndex == numMatrices && + coordIndex == numCoords) + return; + + // data + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; - // coord sends - const Int numsendCoordRequests = sendCoordRequests_[i].size (); - for (Int j = 0; j < numsendCoordRequests; ++j) + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) { - sendCoordStatuses_[i][j] = - !mpi::Test (sendCoordRequests_[i][j]); - if (sendCoordStatuses_[i][j]) - return false; + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } + + // coordinates + for (int rank = 0; rank < p; ++rank) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + mpi::Wait ( 
coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; } + } +} + +template +void AxpyInterface2::Waitall () +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Waitall")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; + Int matrixIndex, coordIndex; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); - // data recvs - const Int numrecvDataRequests = recvDataRequests_[i].size (); - for (Int j = 0; j < numrecvDataRequests; ++j) + // data + for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex) + { + for (int rank = 0; rank < p; ++rank) { - recvDataStatuses_[i][j] = - !mpi::Test (recvDataRequests_[i][j]); - if (recvDataStatuses_[i][j]) - return false; + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } + } + + // coordinates + for (int coordIndex = 0; coordIndex < numCoords; ++coordIndex) + { + for (int rank = 0; rank < p; ++rank) + { + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + } } + } +} + +template +bool AxpyInterface2::Test( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Test")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + const T *base_address = Z.LockedBuffer(); - // data sends - const Int numsendDataRequests = sendDataRequests_[i].size (); - for (Int j = 0; j < numsendDataRequests; ++j) + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } + // search for matrix base in coords + for (Int c = 0; c < numCoords; c++) + { + if ( coords_[c].base_ == base_address ) { - sendDataStatuses_[i][j] = - !mpi::Test (sendDataRequests_[i][j]); - if (sendDataStatuses_[i][j]) + coordIndex = c; + break; + } + coordIndex = c+1; + } + + // matrix not found + if ( matrixIndex == numMatrices && + coordIndex == numCoords) + return true; + + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); + if (matrices_[matrixIndex].statuses_[rank][i]) + return false; + } + } + + for (int rank = 0; rank < p; ++rank) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + if (coords_[coordIndex].statuses_[rank][i]) return false; } } @@ -864,82 +1242,167 @@ bool 
AxpyInterface2::TestRequests( Matrix& Z ) return true; } -// flush ensures local and remote completion -// this interface assumes a send has been issued -// and will post a matching receive and progress - template -void AxpyInterface2::Flush( Matrix& Z ) +// TODO Use mpi::Testany instead of mpi::Test +// at present request object is vector +// of deques, so cannot convert it to +// an array required by Testany +template +bool AxpyInterface2::TestAny( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - const Int height = Z.Height (); - const Int width = Z.Width (); - - DistMatrix& Y = *GlobalArrayPut_; + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::TestAny")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); - const Int p = g.Size (); - const Int me = g.VCRank(); - const Int r = g.Height (); - const Int c = g.Width (); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + const T *base_address = Z.LockedBuffer(); - mpi::Flush (put_win_); - mpi::Flush (acc_win_); - mpi::Flush (getrq_win_); + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } + // search for matrix base in coords + for (Int c = 0; c < numCoords; c++) + { + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + coordIndex = c+1; + } - // get my put/get/acc recv counts - const Int put_count = mpi::ReadInc (put_win_, 0, 0, me); - const Int acc_count = mpi::ReadInc (acc_win_, 0, 0, me); - const Int getrq_count = mpi::ReadInc (getrq_win_, 0, 0, me); + // matrix not found + if ( matrixIndex == numMatrices && + coordIndex == numCoords) + return true; - for (Int count = 0; count < put_count; ++count) + for (int rank = 0; rank < p; ++rank) { - mpi::Status status; - if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_PUT_TAG, g.VCComm(), status) ) - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); + if (matrices_[matrixIndex].statuses_[rank][i]) + continue; + else + return true; + } } - - for (Int count = 0; count < acc_count; ++count) + + for (int rank = 0; rank < p; ++rank) { - mpi::Status status; - if ( mpi::IProbe (mpi::ANY_SOURCE, DATA_ACC_TAG, g.VCComm(), status) ) - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + if (coords_[coordIndex].statuses_[rank][i]) + continue; + else + return true; + } } + + return false; +} - for (Int count = 0; count < getrq_count; ++count) - HandleGlobalToLocalData ( Z ); - - // wait for all requests - coords and data - WaitRequests (Z); +template +bool AxpyInterface2::Testall() +{ + DEBUG_ONLY(CallStackEntry 
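Testall, whose body follows, is the manual analogue of MPI_Testall folded over every attached matrix and coordinate record: each finished request flips its status bit, and the call reports true only once nothing is left in flight. The single-array MPI form, for reference (standard API, not this code's deque layout):

#include <mpi.h>
#include <vector>

bool AllDone( std::vector<MPI_Request>& reqs )
{
    int flag = 0;
    MPI_Testall( static_cast<int>( reqs.size() ), reqs.data(),
                 &flag, MPI_STATUSES_IGNORE );
    return flag != 0;  // true once every request has completed
}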
cse("AxpyInterface2::Testall")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int p = g.Size(); - // flush counts - mpi::ReadInc (put_win_, 0, -put_count, me); - mpi::ReadInc (acc_win_, 0, -acc_count, me); - mpi::ReadInc (getrq_win_, 0, -getrq_count, me); + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + // data + for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex) + { + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); + if (matrices_[matrixIndex].statuses_[rank][i]) + return false; + } + } + } + + // coordinates + for (int coordIndex = 0; coordIndex < numCoords; ++coordIndex) + { + for (int rank = 0; rank < p; ++rank) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for (int i = 0; i < numCoordStatuses; i++) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + if (coords_[coordIndex].statuses_[rank][i]) + return false; + } + } + } + + return true; } -// This is collective flush, this requires all PEs -// to invoke the Ibarrier, incorrect usage will lead -// to deadlocks +// This is non-collective flush +// This will ensure local+remote completion template -void AxpyInterface2::Cflush( Matrix& Z ) +void AxpyInterface2::Flush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Cflush")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); - + DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); - mpi::Status status; - + mpi::Status status; bool DONE = false; - mpi::Request nb_bar_request; - bool nb_bar_active = false; while ( !DONE ) - { + { // similar to HandleXYZ functions in original AxpyInterface if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) { @@ -947,7 +1410,7 @@ void AxpyInterface2::Cflush( Matrix& Z ) { case DATA_PUT_TAG: { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); } case DATA_ACC_TAG: { @@ -959,19 +1422,11 @@ void AxpyInterface2::Cflush( Matrix& Z ) } } } - if ( nb_bar_active ) - { - DONE = mpi::Test ( nb_bar_request ); - } - else - { - // all sends (data or request) are complete - if ( TestRequests( Z ) ) - { - mpi::IBarrier ( g.VCComm(), nb_bar_request ); - nb_bar_active = true; - } - } + + // wait for requests to + // complete one by one + WaitAny (Z); + DONE = Test (Z); } } @@ -1109,6 +1564,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int myCol = g.Col(); Int i, j; + Int matrix_index; std::vector recvVector_; const Int XLDim = Z.LDim(); @@ -1147,18 +1603,19 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - const Int index = - NextIndexData (numEntries, - sendData_[source], - sendDataRequests_[source], - sendDataStatuses_[source]); + T* XBuffer = Z.Buffer(); + const Int index = + NextIndexData (source, + 
numEntries, + XBuffer, + &matrix_index); DEBUG_ONLY (if - (Int (sendData_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *replyBuffer = sendData_[source][index].data (); + (Int (matrices_[matrix_index].data_[source][index].size ()) != + numEntries) LogicError ("Error in NextIndexData");) + T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); + for (Int t = 0; t < localWidth; ++t) { T *sendCol = &replyBuffer[t * localHeight]; @@ -1169,7 +1626,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) // Fire off non-blocking send mpi::TaggedISend (replyBuffer, numEntries, source, DATA_GET_TAG, g.VCComm (), - sendDataRequests_[source][index]); + matrices_[matrix_index].requests_[source][index]); } // receive data @@ -1209,382 +1666,6 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) recvVector_.clear(); } - -// end-to-end blocking update routines -template -void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eput")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - - DistMatrix& Y = *GlobalArrayPut_; - - //do boundary checks - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - const Grid& g = Y.Grid(); - - const Int XLDim = Z.LDim(); - - const Int height = Z.Height(); - const Int width = Z.Width(); - - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - const T* XBuffer = Z.LockedBuffer(); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int YLDim = Y.LDim(); - - // data/coord send - for( Int step=0; step 0 ) - { - const Int destination = receivingRow + r*receivingCol; - // data - const Int index = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); - - DEBUG_ONLY (if - (Int (sendData_[destination][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *sendBuffer = sendData_[destination][index].data (); - - for( Int t=0; t recvVector_; - - for (Int step=0; step -void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eacc")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - - DistMatrix& Y = *GlobalArrayPut_; - - //do boundary checks - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); - - const Grid& g = Y.Grid(); - - const Int XLDim = Z.LDim(); - - const Int height = Z.Height(); - const Int width = Z.Width(); - - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - - const T* XBuffer = Z.LockedBuffer(); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int YLDim = Y.LDim(); - - // data/coord receive - std::vector recvVector_; - - // data/coord send - for( Int step=0; step 0 ) - { - const Int destination = receivingRow + 
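The routines being removed here complete end to end: Eput/Eacc return only after the target has received the update. The locally blocking Put/Acc that the next patch in this series reintroduces promise something weaker, that the caller's buffer is reusable on return, which MPI-2 point-to-point can only guarantee by copying. A sketch of that staging copy (simplified, double payloads, hypothetical name):

#include <cstddef>
#include <cstring>
#include <vector>

// Copy a column-major height x width tile with leading dimension ldim
// into an owned buffer so the send can stay nonblocking while the
// caller immediately reuses src.
std::vector<double> StageTile( const double* src, int height, int width, int ldim )
{
    std::vector<double> staged( static_cast<std::size_t>( height ) * width );
    for( int t = 0; t < width; ++t )
        std::memcpy( &staged[ static_cast<std::size_t>( t ) * height ],
                     &src[ static_cast<std::size_t>( t ) * ldim ],
                     sizeof( double ) * height );
    return staged;
}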
r*receivingCol; - // data - const Int index = - NextIndexData (numEntries, - sendData_[destination], - sendDataRequests_[destination], - sendDataStatuses_[destination]); - - DEBUG_ONLY (if - (Int (sendData_[destination][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *sendBuffer = sendData_[destination][index].data (); - - for( Int t=0; t @@ -1611,33 +1692,8 @@ void AxpyInterface2::Detach() GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - sendDataStatuses_.clear (); - sendCoordStatuses_.clear (); - recvDataStatuses_.clear (); - recvCoordStatuses_.clear (); - - sendDataRequests_.clear (); - sendCoordRequests_.clear (); - recvDataRequests_.clear (); - recvCoordRequests_.clear (); - - sendData_.clear (); - sendCoord_.clear (); - recvData_.clear (); - recvCoord_.clear (); - - mpi::WindowUnlock (put_win_); - mpi::WindowFree (put_win_); - - mpi::WindowUnlock (acc_win_); - mpi::WindowFree (acc_win_); - - mpi::WindowUnlock (getrq_win_); - mpi::WindowFree (getrq_win_); - - delete put_win_base_; - delete acc_win_base_; - delete getrq_win_base_; + matrices_.clear(); + coords_.clear(); } template class AxpyInterface2; diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 9c5c79a8af..4898b5c61d 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -1386,52 +1386,71 @@ void IBarrier (Comm comm, Request & request) // Test for completion bool Test (Request & request) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) Status - status; + DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) + + Status status; int flag; SafeMpi (MPI_Test (&request, &flag, &status)); - return flag; + if (flag) + return true; + else + return false; } bool Test (Request & request, Status & status) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) int - flag; + DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) + int flag; SafeMpi (MPI_Test (&request, &flag, &status)); - return flag; + + if (flag) + return true; + else + return false; } bool Testany (int count, Request * requests, int &indx, Status & status) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int - flag; + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + int flag; SafeMpi (MPI_Testany (count, requests, &indx, &flag, &status)); - return flag; + if (flag) + return true; + else + return false; } bool Testany (int count, Request * requests, int &indx) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int - flag; + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + + int flag; Status status; SafeMpi (MPI_Testany - (count, requests, &indx, &flag, &status)); - return flag; + (count, requests, &indx, &flag, &status)); + if (flag) + return true; + else + return false; } bool Testany (int count, Request * requests) { - DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) int - flag, indx; + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + + int flag, indx; Status status; SafeMpi (MPI_Testany (count, requests, &indx, &flag, &status)); - return flag; + if (flag) + return true; + else + return false; } // Ensure that the request finishes before continuing From 01a44a278650c66f9f2db5f88286e32998f60877 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 11 Feb 2015 13:42:15 -0800 Subject: [PATCH 101/110] modified locally blocking routines Acc/Put - memcpying input Z to an intermediate buffer until I find a clever solution to achieve mpi3_flush_local; updated flush to a noncollective version, introduce waitany function --- include/El/core/AxpyInterface2.0.hpp | 24 +- src/core/AxpyInterface2.0.cpp | 355 +++++++++++++++++++++------ 
2 files changed, 304 insertions(+), 75 deletions(-) diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index 87c683d0e2..2ee538b6f0 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -36,14 +36,21 @@ class AxpyInterface2 void Iacc( const Matrix& Z, Int i, Int j ); // locally blocking update routines - // currently not implemented + // reuse input buffer when returns + void Acc( Matrix& Z, Int i, Int j ); + void Acc( const Matrix& Z, Int i, Int j ); + void Put( Matrix& Z, Int i, Int j ); - void Put( const Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); - void Get( Matrix& Z, Int i, Int j ); + // End to End blocking + void Eacc( Matrix& Z, Int i, Int j ); + void Eacc( const Matrix& Z, Int i, Int j ); - void Acc( Matrix& Z, Int i, Int j ); - void Acc( const Matrix& Z, Int i, Int j ); + void Eput( Matrix& Z, Int i, Int j ); + void Eput( const Matrix& Z, Int i, Int j ); + + void Get( Matrix& Z, Int i, Int j ); // synchronization routines void Flush( Matrix& Z ); @@ -87,6 +94,13 @@ class AxpyInterface2 std::vector coords_; + // for blocking interface + // copying input buffer in this + // intermediate buffer so that input + // buffer could be reused + std::vector>> + dataVectors_; + // TODO need to add const here... DistMatrix* GlobalArrayPut_; DistMatrix* GlobalArrayGet_; diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 0aad2e4367..445c4ab363 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -6,56 +6,31 @@ which can be found in the LICENSE file in the root directory, or at #include "El-lite.hpp" #include -#if MPI_VERSION>=3 + // TODO bring back const interfaces namespace El { template AxpyInterface2::AxpyInterface2() : GlobalArrayPut_(0), GlobalArrayGet_(0), - matrices_(0), coords_(0), + matrices_(0), coords_(0), dataVectors_(0), toBeAttachedForGet_(false), toBeAttachedForPut_(false), - attached_(false), detached_(false) + attached_(false), detached_(true) { } template AxpyInterface2::AxpyInterface2( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::AxpyInterface2")) - + attached_ = false; - detached_ = false; - toBeAttachedForGet_ = true; - toBeAttachedForPut_ = true; - GlobalArrayPut_ = &Z; - GlobalArrayGet_ = &Z; - - const Grid& g = Z.Grid(); - const Int p = g.Size (); + detached_ = true; - if ( matrices_.empty() ) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if ( coords_.empty() ) - { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); - } + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; } template @@ -252,14 +227,17 @@ void AxpyInterface2::Attach( DistMatrix& Z ) DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) // attached_ will be only set in Attach // and only unset in Detach - if (!attached_) + if (!attached_ && detached_) + { attached_ = true; + detached_ = false; + } else LogicError("Must detach before reattaching."); const Grid& g = Z.Grid(); const Int p = g.Size (); - + // the matrix base_ is not known until // an update operation (put/get/acc) // so it is kept blank @@ -271,7 +249,9 
@@ void AxpyInterface2::Attach( DistMatrix& Z ) toBeAttachedForPut_ = true; GlobalArrayGet_ = &Z; toBeAttachedForGet_ = true; - + + dataVectors_.resize(p); + if ( matrices_.empty() ) { struct matrix_params_ mp; @@ -302,9 +282,9 @@ void AxpyInterface2::Attach( DistMatrix& Z ) // end-to-end blocking put/acc routines template -void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eput")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -461,10 +441,11 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) recvVector_.clear(); } +// end to end blocking routines template -void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eacc")) if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); @@ -952,7 +933,241 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } -// waitny implementation +// nonblocking, local completion +template +void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + Int matrix_index, coord_index; + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim (); + + // copy local matrix buffer + const Int my_rank = g.VCRank(); + const Int numCreated = dataVectors_[my_rank].size (); + + dataVectors_[my_rank].resize (numCreated + 1); + dataVectors_[my_rank][numCreated].resize (width * height); + + // TODO Buffer could be const, but at present + // NextIndexMatrix only supports non-const + // pointers + T* Buffer = Z.Buffer(); + T* ZBuffer = reinterpret_cast < T * >(dataVectors_[my_rank][numCreated].data()); + + MemCopy (ZBuffer, Buffer, height * width); + T* XBuffer = reinterpret_cast < T * >(ZBuffer); + + for( Int step=0; step(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + mpi::TaggedISend (coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); + } + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } +} + +// input buffer could be modified upon exit +// from this function + +// as it turns out, it is perhaps impossible +// to ensure local completion using mpi-2, +// hence we memcpy the input buffer so that +// local buffer could be updated when this +// function returns +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + + if( i 
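Put above (and Acc just below) visit every (receivingRow, receivingCol) position of the r x c process grid and size each message from the cyclic distribution through the same Shift/Length helpers used throughout these interfaces. A sketch of their semantics under the usual Elemental conventions (an assumption for illustration, not the library source):

#include <algorithm>

// First global index owned by 'rank', given the distribution's
// alignment and stride (the process-grid dimension).
int Shift( int rank, int align, int stride )
{ return (rank + stride - align) % stride; }

// How many of n cyclically distributed entries land on the process
// whose first owned index is 'shift'.
int Length( int n, int shift, int stride )
{ return std::max( 0, (n - shift + stride - 1) / stride ); }

// The patch for grid position (row,col) is then
//   localHeight = Length( height, Shift( row, colAlign, r ), r );
//   localWidth  = Length( width,  Shift( col, rowAlign, c ), c );
// and a message is sent only when numEntries = localHeight*localWidth
// is nonzero.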
< 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); + + DistMatrix& Y = *GlobalArrayPut_; + Int matrix_index, coord_index; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = (Y.ColAlign() + i) % r; + const Int rowAlign = (Y.RowAlign() + j) % c; + + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const Int YLDim = Y.LDim(); + + // copy local matrix buffer + const Int my_rank = g.VCRank(); + const Int numCreated = dataVectors_[my_rank].size (); + + dataVectors_[my_rank].resize (numCreated + 1); + dataVectors_[my_rank][numCreated].resize (width * height); + + // TODO Buffer could be const, but at present + // NextIndexMatrix only supports non-const + // pointers + T* Buffer = Z.Buffer(); + T* ZBuffer = reinterpret_cast < T * >(dataVectors_[my_rank][numCreated].data()); + + MemCopy (ZBuffer, Buffer, height * width); + T* XBuffer = reinterpret_cast < T * >(ZBuffer); + + for( Int step=0; step(coords_[coord_index].coord_[destination][cindex].data()); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + + mpi::TaggedISend (coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex]); + } + + receivingRow = (receivingRow + 1) % r; + if( receivingRow == 0 ) + receivingCol = (receivingCol + 1) % c; + } +} + +// waitany implementation // cannot use mpi::Waitany // as of now because request // objects are vector of deques @@ -1016,8 +1231,6 @@ void AxpyInterface2::WaitAny( Matrix& Z ) matrices_[matrixIndex].statuses_[rank][i] = true; return; } - else - continue; } } @@ -1037,8 +1250,6 @@ void AxpyInterface2::WaitAny( Matrix& Z ) coords_[coordIndex].statuses_[rank][i] = true; return; } - else - continue; } } } @@ -1397,35 +1608,36 @@ void AxpyInterface2::Flush( Matrix& Z ) DistMatrix& Y = *GlobalArrayPut_; const Grid& g = Y.Grid(); - - mpi::Status status; + bool DONE = false; + mpi::Status status; while ( !DONE ) - { - // similar to HandleXYZ functions in original AxpyInterface + { if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) { - switch (status.MPI_TAG) - { - case DATA_PUT_TAG: - { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - } - case DATA_ACC_TAG: - { - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - } - case REQUEST_GET_TAG: - { - HandleGlobalToLocalData ( Z ); - } - } + switch (status.MPI_TAG) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData ( Z ); + break; + } + } } - // wait for requests to // complete one by one - WaitAny (Z); + WaitAny (Z); DONE = Test (Z); } } @@ -1463,7 +1675,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) T *getBuffer = getVector_.data(); mpi::TaggedRecv (getBuffer, count, source, DATA_PUT_TAG, g.VCComm()); - + // Update Y const T *XBuffer = reinterpret_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign() + i) % r; @@ 
-1482,6 +1694,7 @@ void AxpyInterface2<T>::HandleLocalToGlobalData ( Matrix<T>& Z, Int source )
         const T *XCol = &XBuffer[t * localHeight];
         MemCopy (YCol, XCol, localHeight);
     }
+    // Free the memory
     getVector_.clear();
 }
 
@@ -1682,6 +1895,7 @@ void AxpyInterface2<T>::Detach()
                       GlobalArrayPut_->Grid() :
                       GlobalArrayGet_->Grid() );
+    const Int p = g.Size();
     mpi::Barrier( g.VCComm() );
 
     attached_ = false;
@@ -1692,6 +1906,8 @@ void AxpyInterface2<T>::Detach()
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;
 
+    dataVectors_.clear();
+
     matrices_.clear();
     coords_.clear();
 }
@@ -1703,4 +1919,3 @@ template class AxpyInterface2<Complex<float>>;
 template class AxpyInterface2<Complex<double>>;
 
 } // namespace El
-#endif

From 5e96051f5216206d22d48424c33044bdbd344a15 Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Fri, 13 Feb 2015 12:52:22 -0800
Subject: [PATCH 102/110] added const interfaces and some new functions in
 RmaInterface for request-based RMA

---
 include/El/core/AxpyInterface2.0.hpp |  26 +-
 include/El/core/RmaInterface.hpp     |  61 +-
 src/core/AxpyInterface2.0.cpp        | 379 ++++++++---
 src/core/RmaInterface.cpp            | 938 +++++++++++++++++----------
 4 files changed, 936 insertions(+), 468 deletions(-)

diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp
index 2ee538b6f0..d0b63845a2 100644
--- a/include/El/core/AxpyInterface2.0.hpp
+++ b/include/El/core/AxpyInterface2.0.hpp
@@ -44,6 +44,7 @@ class AxpyInterface2
     void Put( const Matrix<T>& Z, Int i, Int j );
 
     // End to End blocking
+    // will be deprecated soon
     void Eacc( Matrix<T>& Z, Int i, Int j );
     void Eacc( const Matrix<T>& Z, Int i, Int j );
 
@@ -69,7 +70,7 @@ class AxpyInterface2
     // struct for passing data
     struct matrix_params_
     {
-        T *base_;
+        const void *base_;
        std::vector<std::deque<std::vector<T>>>
        data_;
        std::vector<std::deque<mpi::Request>>
@@ -83,7 +84,7 @@ class AxpyInterface2
     // struct for passing coordinates
     struct coord_params_
     {
-        T *base_;
+        const void *base_;
        std::vector<std::deque<std::vector<Int>>>
        coord_;
        std::vector<std::deque<mpi::Request>>
@@ -101,10 +102,9 @@ class AxpyInterface2
     std::vector<std::deque<std::vector<T>>>
     dataVectors_;
 
-    // TODO need to add const here...
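Making base_ a type-erased const void* is what lets const and non-const matrices share one bookkeeping path: pending transfers are located purely by the identity of the user's buffer. A simplified, illustrative version of that lookup (the request and status deques of the real matrix_params_ are elided):

#include <cstddef>
#include <vector>

// Per-matrix bookkeeping, keyed by the user buffer's address.
struct TransferRecord
{
    const void* base_;
};

// Returns the record index for 'base', or -1 when the matrix has no
// pending transfers (mirrors the linear searches in WaitAny/Test).
int FindRecord
( const std::vector<TransferRecord>& records, const void* base )
{
    for( std::size_t m=0; m<records.size(); ++m )
        if( records[m].base_ == base )
            return static_cast<int>( m );
    return -1;
}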
- DistMatrix* GlobalArrayPut_; - DistMatrix* GlobalArrayGet_; - + DistMatrix* GlobalArrayPut_; + const DistMatrix* GlobalArrayGet_; + bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; @@ -112,34 +112,34 @@ class AxpyInterface2 Int NextIndexData ( Int target, Int dataSize, - T * base_address, + const void* base_address, Int *mindex); Int NextIndexCoord ( Int i, Int j, Int target, - T * base_address, + const void* base_address, Int *cindex); bool Testall(); - bool Test( Matrix& Z ); - bool Test( const Matrix& Z ); + bool Test( Matrix& Z ); + bool Test( const Matrix& Z ); bool TestAny( Matrix& Z ); bool TestAny( const Matrix& Z ); void Waitall(); - void Wait( Matrix& Z ); - void Wait( const Matrix& Z ); + void Wait( Matrix& Z ); + void Wait( const Matrix& Z ); void WaitAny( Matrix& Z ); void WaitAny( const Matrix& Z ); // these are only used for nonblocking // update rountines void HandleGlobalToLocalData( Matrix& Z ); + void HandleLocalToGlobalData( Matrix& Z, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int source ); - void HandleGlobalToLocalData( const Matrix& Z ); void HandleLocalToGlobalData( const Matrix& Z, Int source ); void HandleLocalToGlobalAcc( const Matrix& Z, Int source ); }; diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 432125395a..92e6f442f9 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -46,16 +46,42 @@ class RmaInterface void Iacc( Matrix& Z, Int i, Int j ); void Iacc( const Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z, Int i, Int j ); - void Flush( const Matrix& Z, Int i, Int j ); - void Flush( Matrix& Z ); - void Flush( const Matrix& Z ); + // Request based RMA + void Rput( Matrix& Z, Int i, Int j ); + void Rput( const Matrix& Z, Int i, Int j ); + + void Racc( Matrix& Z, Int i, Int j ); + void Racc( const Matrix& Z, Int i, Int j ); + + // Synchronization routines + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + void LocalFlush( const Matrix& Z ); + void LocalFlush( Matrix& Z ); + void LocalFlush(); void Detach(); private: + mpi::Window window; + // struct for passing data + // for request based rma + struct matrix_params_ + { + const void *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector matrices_; + + // buffers for rma std::vector>> getVector_, putVector_; @@ -65,8 +91,33 @@ class RmaInterface bool toBeAttachedForPut_, toBeAttachedForGet_, attached_, detached_; - Int NextIndex ( Int dataSize, + // next index for data + Int NextIndex ( + Int dataSize, std::deque > &dataVectors ); + + Int NextIndex ( + Int target, + Int dataSize, + const void* base_address, + Int* mindex); + + // only relevant for request-based + // passive RMA + bool anyPendingXfers ( Matrix& Z ); + bool anyPendingXfers ( const Matrix& Z ); + + bool Testall(); + bool Test( Matrix& Z ); + bool Test( const Matrix& Z ); + bool TestAny( Matrix& Z ); + bool TestAny( const Matrix& Z ); + + void Waitall(); + void Wait( Matrix& Z ); + void Wait( const Matrix& Z ); + void WaitAny( Matrix& Z ); + void WaitAny( const Matrix& Z ); }; #endif //MPI-3 } // namespace El diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 445c4ab363..548c6f3fa6 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -6,8 +6,6 @@ which can be found in the LICENSE file in the root directory, or at #include "El-lite.hpp" #include - -// TODO bring back const interfaces namespace El { template @@ -33,6 
+31,21 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) GlobalArrayGet_ = 0; } +template +AxpyInterface2::AxpyInterface2( const DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::AxpyInterface2")) + + attached_ = false; + detached_ = true; + + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; +} + template AxpyInterface2::~AxpyInterface2() { @@ -58,7 +71,7 @@ template Int AxpyInterface2::NextIndexData ( Int target, Int dataSize, - T * base_address, + const void* base_address, Int *mindex) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexData")) @@ -66,8 +79,9 @@ Int AxpyInterface2::NextIndexData ( assert ( base_address != NULL ); Int matrixIndex = 0; - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numMatrices = matrices_.size(); @@ -141,7 +155,7 @@ template Int AxpyInterface2::NextIndexCoord ( Int i, Int j, Int target, - T * base_address, + const void* base_address, Int *cindex) { DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexCoord")) @@ -149,8 +163,9 @@ Int AxpyInterface2::NextIndexCoord ( assert ( base_address != NULL ); Int coordIndex = 0; - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numCoords = coords_.size(); @@ -250,7 +265,65 @@ void AxpyInterface2::Attach( DistMatrix& Z ) GlobalArrayGet_ = &Z; toBeAttachedForGet_ = true; - dataVectors_.resize(p); + if ( dataVectors_.empty() ) + dataVectors_.resize(p); + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize(p); + cp.requests_.resize(p); + cp.statuses_.resize(p); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } + } + + mpi::Barrier (g.VCComm()); +} + +template +void AxpyInterface2::Attach( const DistMatrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) + // attached_ will be only set in Attach + // and only unset in Detach + if (!attached_ && detached_) + { + attached_ = true; + detached_ = false; + } + else + LogicError("Must detach before reattaching."); + + const Grid& g = Z.Grid(); + const Int p = g.Size (); + + // the matrix base_ is not known until + // an update operation (put/get/acc) + // so it is kept blank + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; if ( matrices_.empty() ) { @@ -282,7 +355,7 @@ void AxpyInterface2::Attach( DistMatrix& Z ) // end-to-end blocking put/acc routines template -void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eput")) @@ -311,8 +384,9 @@ void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - T* XBuffer = Z.Buffer(); - + const T* XBuffer = Z.LockedBuffer(); + const void* 
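NextIndexData and NextIndexCoord hand out the next free per-destination slot in those deques; in essence each destination gets a grow-on-demand pool in which any slot whose request has completed may be recycled. A deliberately simplified sketch of the idea (the real routines also match base addresses and return the matrix index through mindex):

#include <mpi.h>
#include <deque>
#include <vector>

struct Slots
{
    std::deque<std::vector<double>> data; // staged send buffers
    std::deque<MPI_Request> requests;     // one request per buffer
};

// Return a slot index whose buffer is safe to overwrite: recycle a
// completed slot if one exists, otherwise append a fresh one.
// (MPI_REQUEST_NULL tests as complete, so new slots start out free.)
int NextIndex( Slots& s, int dataSize )
{
    for( std::size_t i=0; i<s.requests.size(); ++i )
    {
        int flag = 0;
        MPI_Test( &s.requests[i], &flag, MPI_STATUS_IGNORE );
        if( flag )
        {
            s.data[i].resize( dataSize );
            return static_cast<int>( i );
        }
    }
    s.data.push_back( std::vector<double>( dataSize ) );
    s.requests.push_back( MPI_REQUEST_NULL );
    return static_cast<int>( s.data.size()-1 );
}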
Buffer = static_cast(const_cast(Z.LockedBuffer())); + Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -340,7 +414,7 @@ void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) const Int dindex = NextIndexData (destination, numEntries, - XBuffer, + Buffer, &matrix_index); DEBUG_ONLY (if @@ -365,7 +439,7 @@ void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, destination, - XBuffer, + Buffer, &coord_index); Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); @@ -441,9 +515,15 @@ void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) recvVector_.clear(); } +template +void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) +{ + Eput( const_cast&>(Z), i, j ); +} + // end to end blocking routines template -void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eacc")) @@ -472,7 +552,8 @@ void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - T* XBuffer = Z.Buffer(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -504,7 +585,7 @@ void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) const Int dindex = NextIndexData (destination, numEntries, - XBuffer, + Buffer, &matrix_index); DEBUG_ONLY (if @@ -529,7 +610,7 @@ void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, destination, - XBuffer, + Buffer, &coord_index); Int * coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); @@ -603,6 +684,12 @@ void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) recvVector_.clear(); } +template +void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) +{ + Eacc( const_cast&>(Z), i, j ); +} + template void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { @@ -612,7 +699,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); - DistMatrix& X = *GlobalArrayGet_; + const DistMatrix& X = *GlobalArrayGet_; const Int height = Z.Height (); const Int width = Z.Width (); @@ -621,6 +708,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) LogicError ("Invalid submatrix for Iget"); T* XBuffer = Z.Buffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + const Grid & g = X.Grid (); const Int p = g.Size (); const Int r = g.Height (); @@ -635,7 +724,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, rank, - XBuffer, + Buffer, &coord_index); Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); @@ -680,7 +769,8 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // Unpack the local matrix for (Int t = 0; t < localWidth; ++t) { - T *YCol = X.Buffer (0, rowShift + t * c); + //T *YCol = X.Buffer (0, rowShift + t * c); + T *YCol = Z.Buffer (0, rowShift + t * c); const T *XCol = &recvBuffer[t * localHeight]; for (Int s = 0; s < localHeight; ++s) YCol[colShift + s * r] = XCol[s]; @@ -693,7 +783,7 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // nonblocking, no local completion template -void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry 
cse("AxpyInterface2::Iput")) @@ -726,7 +816,9 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) Int receivingCol = myProcessCol; const Int YLDim = Y.LDim (); - T* XBuffer = Z.Buffer(); + + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); for( Int step=0; step::Iput( Matrix& Z, Int i, Int j ) const Int dindex = NextIndexData (destination, numEntries, - XBuffer, + Buffer, &matrix_index); DEBUG_ONLY (if @@ -770,7 +862,7 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, destination, - XBuffer, + Buffer, &coord_index); Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); @@ -790,6 +882,12 @@ void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) } } +template +void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) +{ + Iput( const_cast&>(Z), i, j ); +} + template void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { @@ -799,11 +897,12 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) LogicError ("Cannot perform this operation as matrix is not attached."); - DistMatrix& X = *GlobalArrayGet_; + const DistMatrix& X = *GlobalArrayGet_; const Int height = Z.Height (); const Int width = Z.Width (); - T* XBuffer = Z.Buffer(); + + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); Int coord_index; @@ -820,7 +919,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, rank, - XBuffer, + Buffer, &coord_index); Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data()); @@ -838,7 +937,7 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) @@ -872,7 +971,9 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) Int receivingCol = myProcessCol; const Int YLDim = Y.LDim(); - T* XBuffer = Z.Buffer(); + + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); for( Int step=0; step::Iacc( Matrix& Z, Int i, Int j ) const Int dindex = NextIndexData (destination, numEntries, - XBuffer, + Buffer, &matrix_index); DEBUG_ONLY (if @@ -914,7 +1015,7 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) const Int cindex = NextIndexCoord (i, j, destination, - XBuffer, + Buffer, &coord_index); Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[destination][cindex].data()); @@ -933,9 +1034,15 @@ void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) } } +template +void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) +{ + Iacc( const_cast&>(Z), i, j ); +} + // nonblocking, local completion template -void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) @@ -976,13 +1083,11 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) dataVectors_[my_rank].resize (numCreated + 1); dataVectors_[my_rank][numCreated].resize (width * height); - // TODO Buffer could be const, but at present - // NextIndexMatrix only supports non-const - // pointers - T* Buffer = Z.Buffer(); + const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); T* ZBuffer = reinterpret_cast < T * 
>(dataVectors_[my_rank][numCreated].data()); - MemCopy (ZBuffer, Buffer, height * width); + MemCopy (ZBuffer, reinterpret_cast < const T * >(Buffer), + height * width); T* XBuffer = reinterpret_cast < T * >(ZBuffer); for( Int step=0; step::Put( Matrix& Z, Int i, Int j ) } } +template +void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +{ + Put( const_cast&>(Z), i, j ); +} + // input buffer could be modified upon exit // from this function - -// as it turns out, it is perhaps impossible -// to ensure local completion using mpi-2, -// hence we memcpy the input buffer so that -// local buffer could be updated when this -// function returns template -void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) @@ -1097,13 +1202,11 @@ void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) dataVectors_[my_rank].resize (numCreated + 1); dataVectors_[my_rank][numCreated].resize (width * height); - // TODO Buffer could be const, but at present - // NextIndexMatrix only supports non-const - // pointers - T* Buffer = Z.Buffer(); + const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); T* ZBuffer = reinterpret_cast < T * >(dataVectors_[my_rank][numCreated].data()); - MemCopy (ZBuffer, Buffer, height * width); + MemCopy (ZBuffer, reinterpret_cast < const T * >(Buffer), + height * width); T* XBuffer = reinterpret_cast < T * >(ZBuffer); for( Int step=0; step::Acc( Matrix& Z, Int i, Int j ) } } +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ + Acc( const_cast&>(Z), i, j ); +} + // waitany implementation // cannot use mpi::Waitany // as of now because request // objects are vector of deques template -void AxpyInterface2::WaitAny( Matrix& Z ) +void AxpyInterface2::WaitAny( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::WaitAny")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - const T *base_address = Z.LockedBuffer(); - + const void* base_address = static_cast(const_cast(Z.LockedBuffer())); + // search for matrix base for (Int m = 0; m < numMatrices; m++) { @@ -1255,21 +1365,28 @@ void AxpyInterface2::WaitAny( Matrix& Z ) } template -void AxpyInterface2::Wait( Matrix& Z ) +void AxpyInterface2::WaitAny( Matrix& Z ) +{ + WaitAny( const_cast&>(Z) ); +} + +template +void AxpyInterface2::Wait( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Wait")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? 
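Each non-const overload above (Eput, Eacc, Iput, Iacc, Put, Acc) now just forwards to its const counterpart. One detail worth noting: the const_cast only adds const, which static_cast (or a named const reference) would also do, but some cast is required, since otherwise overload resolution would prefer the non-const overload and the wrapper would call itself. A generic sketch of the idiom:

// Implement the operation once against const, forward the mutable
// overload; the cast selects the const overload and prevents the
// wrapper from recursing into itself.
struct UpdateInterface
{
    void Update( const double& ) { /* the real work happens here */ }
    void Update( double& z )
    { Update( static_cast<const double&>(z) ); }
};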
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - const T *base_address = Z.LockedBuffer(); + const void* base_address = static_cast(const_cast(Z.LockedBuffer())); // search for matrix base for (Int m = 0; m < numMatrices; m++) @@ -1329,17 +1446,24 @@ void AxpyInterface2::Wait( Matrix& Z ) } } +template +void AxpyInterface2::Wait( Matrix& Z ) +{ + Wait( const_cast&>(Z) ); +} + template void AxpyInterface2::Waitall () { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Waitall")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - Int matrixIndex, coordIndex; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); + Int matrixIndex, coordIndex; const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); @@ -1376,21 +1500,22 @@ void AxpyInterface2::Waitall () } template -bool AxpyInterface2::Test( Matrix& Z ) +bool AxpyInterface2::Test( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Test")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - const T *base_address = Z.LockedBuffer(); + const void* base_address = static_cast(const_cast(Z.LockedBuffer())); // search for matrix base for (Int m = 0; m < numMatrices; m++) @@ -1453,26 +1578,29 @@ bool AxpyInterface2::Test( Matrix& Z ) return true; } -// TODO Use mpi::Testany instead of mpi::Test -// at present request object is vector -// of deques, so cannot convert it to -// an array required by Testany template -bool AxpyInterface2::TestAny( Matrix& Z ) +bool AxpyInterface2::Test( Matrix& Z ) +{ + return Test( const_cast&>(Z) ); +} + +template +bool AxpyInterface2::TestAny( const Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::TestAny")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - const T *base_address = Z.LockedBuffer(); + const void* base_address = static_cast(const_cast(Z.LockedBuffer())); // search for matrix base for (Int m = 0; m < numMatrices; m++) @@ -1539,15 +1667,22 @@ bool AxpyInterface2::TestAny( Matrix& Z ) return false; } +template +bool AxpyInterface2::TestAny( Matrix& Z ) +{ + return TestAny( const_cast&>(Z) ); +} + template bool AxpyInterface2::Testall() { DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Testall")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer at first."); - DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numMatrices = matrices_.size(); @@ -1598,6 +1733,47 @@ bool AxpyInterface2::Testall() // This is non-collective flush // This will ensure local+remote completion +// if Z is const then only Put/Acc is possible +template +void AxpyInterface2::Flush( const Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) + + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + + bool DONE = false; + mpi::Status status; + + while ( !DONE ) + { + if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) + { + switch (status.MPI_TAG) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + break; + } + } + } + // wait for requests to + // complete one by one + WaitAny (Z); + DONE = Test (Z); + } +} + template void AxpyInterface2::Flush( Matrix& Z ) { @@ -1605,10 +1781,11 @@ void AxpyInterface2::Flush( Matrix& Z ) if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); - DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid(); - bool DONE = false; mpi::Status status; @@ -1643,7 +1820,7 @@ void AxpyInterface2::Flush( Matrix& Z ) } template < typename T > -void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) +void AxpyInterface2::HandleLocalToGlobalData ( const Matrix& Z, Int source ) { DistMatrix &Y = *GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -1677,7 +1854,7 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) DATA_PUT_TAG, g.VCComm()); // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const T *XBuffer = const_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; const Int colShift = Shift (myRow, colAlign, r); @@ -1699,9 +1876,15 @@ void AxpyInterface2::HandleLocalToGlobalData ( Matrix& Z, Int source ) getVector_.clear(); } +template +void AxpyInterface2::HandleLocalToGlobalData( Matrix& Z, Int source ) +{ + HandleLocalToGlobalData( const_cast&>(Z), source ); +} + // replica of above function except this accumulates template < typename T > -void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int source ) +void AxpyInterface2::HandleLocalToGlobalAcc ( const Matrix& Z, Int source ) { DistMatrix &Y = *GlobalArrayPut_; const Grid & g = Y.Grid (); @@ -1736,7 +1919,7 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int source ) DATA_ACC_TAG, g.VCComm()); // Update Y - const T *XBuffer = reinterpret_cast < const T * >(getBuffer); + const T *XBuffer = const_cast < const T * >(getBuffer); const Int colAlign = (Y.ColAlign() + i) % r; const Int rowAlign = (Y.RowAlign() + j) % c; const Int colShift = Shift (myRow, colAlign, r); @@ -1759,6 +1942,12 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( Matrix& Z, Int source ) getVector_.clear(); } +template +void AxpyInterface2::HandleLocalToGlobalAcc( Matrix& Z, Int source ) +{ + HandleLocalToGlobalAcc( const_cast&>(Z), source ); +} + // handle request for data, post a matching isend template < typename T > void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) @@ -1768,7 +1957,7 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) if ( !toBeAttachedForGet_ ) LogicError("Local matrix cannot be updated"); - DistMatrix& Y = *GlobalArrayGet_; + const DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); @@ -1780,6 +1969,8 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) Int matrix_index; std::vector recvVector_; + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); @@ -1816,11 +2007,10 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) DEBUG_ONLY (if (numEntries < Int (sizeof (T))) LogicError ("Count was too small");) - T* XBuffer = Z.Buffer(); const Int index = NextIndexData (source, numEntries, - XBuffer, + Buffer, &matrix_index); DEBUG_ONLY (if @@ -1906,8 +2096,9 @@ void AxpyInterface2::Detach() GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - dataVectors_.clear(); - + if (!dataVectors_.empty()) + dataVectors_.clear(); + matrices_.clear(); coords_.clear(); } diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index fe7aaead4b..0a4f383489 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -2,7 +2,7 @@ Copyright (c) 2009-2014, Jack Poulson Copyright (c) 2011, The University of 
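Two details in these Flush variants deserve emphasis. First, the switch now breaks after each case; the earlier version fell through, so a DATA_PUT_TAG message would also have run the accumulate (and get) handlers. Second, the const overload dispatches only puts and accumulates, since a const matrix cannot be the target of a get. The loop itself is a small progress engine; a standalone skeleton in plain MPI, with illustrative tags and the actual handlers elided:

#include <mpi.h>

enum { DATA_PUT_TAG = 1, DATA_ACC_TAG = 2, REQUEST_GET_TAG = 3 };

// Poll for incoming traffic, dispatch on tag, and keep testing this
// rank's own requests until everything it initiated has completed.
void ProgressUntilDone( MPI_Comm comm, bool (*allLocalDone)() )
{
    bool done = false;
    while( !done )
    {
        int flag = 0;
        MPI_Status status;
        MPI_Iprobe( MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &flag, &status );
        if( flag )
        {
            switch( status.MPI_TAG )
            {
            case DATA_PUT_TAG:    /* recv and copy into Y  */ break;
            case DATA_ACC_TAG:    /* recv and accumulate   */ break;
            case REQUEST_GET_TAG: /* send requested patch  */ break;
            }
        }
        done = allLocalDone(); // plays the role of Test(Z) above
    }
}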
Texas at Austin Copyright (c) 2014, Jeff Hammond (Intel) - Copyright (c) 2014, Sayan Ghosh (University of Houston) + Copyright (c) 2014, Sayan Ghosh (Washington State University) All rights reserved. Authors: @@ -17,17 +17,17 @@ which can be found in the LICENSE file in the root directory, or at // TODO Complete the const interfaces... // TODO RMA related checks pending (e.g bounds checking)... -// TODO Add a logic in Flush to return immediately if Get is used +// TODO debug messages #if MPI_VERSION>=3 namespace El { - template RmaInterface::RmaInterface() : GlobalArrayPut_(0), GlobalArrayGet_(0), - putVector_(0), getVector_(0), window (MPI_WIN_NULL), + matrices_(0), window (MPI_WIN_NULL), + putVector_(0), getVector_(0), toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(false) + attached_(false), detached_(true) { } template @@ -35,39 +35,36 @@ RmaInterface::RmaInterface( DistMatrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - attached_ = false; - detached_ = false; - toBeAttachedForGet_ = true; - toBeAttachedForPut_ = true; - GlobalArrayPut_ = &Z; - GlobalArrayGet_ = &Z; - window = MPI_WIN_NULL; - - const Int p = Z.Grid ().Size(); - - putVector_.resize( p ); - getVector_.resize( p ); + attached_ = false; + detached_ = true; + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + window = MPI_WIN_NULL; } +// until attach, I am not setting anything +// which might not be a good thing to do, +// but would modify this eventually template RmaInterface::RmaInterface( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - attached_ = false; - detached_ = false; - toBeAttachedForGet_ = true; - toBeAttachedForPut_ = false; - GlobalArrayGet_ = &X; - GlobalArrayPut_ = 0; - window = MPI_WIN_NULL; - - const Int p = X.Grid ().Size (); - - getVector_.resize( p ); - putVector_.resize( p ); + attached_ = false; + detached_ = true; + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + window = MPI_WIN_NULL; } template @@ -99,8 +96,11 @@ void RmaInterface::Attach( DistMatrix& Z ) DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) // attached_ will be only set in Attach // and only unset in Detach - if (!attached_) + if (!attached_ && detached_) + { attached_ = true; + detached_ = false; + } else LogicError("Must detach before reattaching."); @@ -108,21 +108,28 @@ void RmaInterface::Attach( DistMatrix& Z ) // transfers -- put, get and acc are possible if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; } + const Grid& g = Z.Grid(); const Int p = g.Size (); + if (matrices_.size() != p) + { + matrices_.resize( p ); + } + if (putVector_.size() != p) { getVector_.resize( p ); putVector_.resize( p ); } - // do rma related checks + // TODO rma related checks + // creation of window const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); const Int bufferSize = numEntries * sizeof(T); void* baseptr = reinterpret_cast(Z.Buffer ()); @@ -132,12 +139,16 @@ void RmaInterface::Attach( DistMatrix& Z ) mpi::WindowLock (window); } +// for gets template void RmaInterface::Attach( const DistMatrix& X ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if (!attached_) + if (!attached_ && 
detached_) + { attached_ = true; + detached_ = false; + } else LogicError("Must detach before reattaching."); @@ -154,22 +165,26 @@ void RmaInterface::Attach( const DistMatrix& X ) const Grid& g = X.Grid(); const Int p = g.Size (); - if (putVector_.size() != p) + if (matrices_.size() != p) + { + matrices_.resize( p ); + } + if (getVector_.size() != p) { getVector_.resize( p ); - putVector_.resize( p ); } - - //do rma related checks + + //TODO rma related checks const Int numEntries = X.LocalHeight () * X.LocalWidth (); const Int bufferSize = numEntries * sizeof(T); - void* baseptr = (void*)(X.LockedBuffer ()); + void* baseptr = static_cast(const_cast(X.LockedBuffer ())); assert (baseptr != NULL); mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); mpi::WindowLock (window); } +// for standard passive rma template Int RmaInterface::NextIndex ( Int dataSize, @@ -185,86 +200,96 @@ Int RmaInterface::NextIndex return numCreated; } -// Locally Blocking +// for request-based passive rma template -void RmaInterface::Put( Matrix& Z, Int i, Int j ) +Int RmaInterface::NextIndex ( + Int target, + Int dataSize, + const void* base_address, + Int *mindex) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); - - DistMatrix& Y = *GlobalArrayPut_; - //do rma related checks - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) + + assert ( base_address != NULL ); + Int matrixIndex = 0; + const DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - const Int iLocalOffset = Length( i, Y.ColShift (), r ); - const Int jLocalOffset = Length( j, Y.RowShift (), c ); - - const Int YLDim = Y.LDim (); + const Int numMatrices = matrices_.size(); - for( Int step=0; step::NextIndex ( numEntries, - putVector_[destination]); - - T* sendBuffer = putVector_[destination][index].data(); - T* XBuffer = Z.Buffer(); - - for( Int t=0; t -void RmaInterface::Put( const Matrix& Z, Int i, Int j ) +void RmaInterface::Rput( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Rput")) + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) @@ -296,6 +321,10 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) const Int jLocalOffset = Length( j, Y.RowShift (), c ); const Int YLDim = Y.LDim (); + + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + Int matrix_index; for( Int step=0; step::Put( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination]); - T* sendBuffer = putVector_[destination][index].data(); - const T* XBuffer = Z.LockedBuffer(); - + const Int index = + NextIndex 
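Both Attach overloads end the same way: expose the local matrix buffer through an MPI window, then lock it once for the lifetime of the attachment. Assuming mpi::WindowCreate and mpi::WindowLock wrap MPI_Win_create and MPI_Win_lock_all, the plain MPI-3 equivalent is:

#include <mpi.h>
#include <cassert>

// Expose 'base' (bytes long) to every rank in comm and open a
// passive-target access epoch that lasts until detach, so individual
// Put/Acc/Get calls need no per-transfer lock/unlock.
MPI_Win AttachWindow( void* base, MPI_Aint bytes, MPI_Comm comm )
{
    assert( base != NULL );
    MPI_Win win;
    MPI_Win_create( base, bytes, /*disp_unit=*/1, MPI_INFO_NULL,
                    comm, &win );
    MPI_Win_lock_all( /*assert=*/0, win );
    return win;
}

// Detach mirrors this with MPI_Win_unlock_all and MPI_Win_free.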
(destination, + numEntries, + Buffer, + &matrix_index); + + DEBUG_ONLY (if + (Int (matrices_[matrix_index].data_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *sendBuffer = matrices_[matrix_index].data_[destination][index].data (); + for( Int t=0; t::Put( const Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void RmaInterface::Acc( Matrix& Z, Int i, Int j ) -{ - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) - - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); - - DistMatrix& Y = *GlobalArrayPut_; - - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); - - //do rma related checks - - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int XLDim = Z.LDim(); - const Int YLDim = Y.LDim (); - // local matrix width and height - const Int height = Z.Height(); - const Int width = Z.Width(); - - const Int iLocalOffset = Length( i, Y.ColShift (), r ); - const Int jLocalOffset = Length( j, Y.RowShift (), c ); - - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - - for( Int step=0; step::NextIndex ( numEntries, - putVector_[destination]); - - T* sendBuffer = putVector_[destination][index].data(); - T* XBuffer = Z.Buffer(); - - for( Int t=0; t -void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) +void RmaInterface::Racc( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Racc")) if ( !toBeAttachedForPut_ ) LogicError("Global matrix cannot be updated."); @@ -426,7 +388,6 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) LogicError("Submatrix out of bounds of global matrix."); //do rma related checks - const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); @@ -441,7 +402,12 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - + + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + + Int matrix_index; + const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); @@ -460,10 +426,17 @@ void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination]); - T* sendBuffer = putVector_[destination][index].data(); - const T* XBuffer = Z.LockedBuffer(); + const Int index = + NextIndex (destination, + numEntries, + Buffer, + &matrix_index); + + DEBUG_ONLY (if + (Int (matrices_[matrix_index].data_[destination][index].size ()) != + numEntries) LogicError ("Error in NextIndex");) + + T *sendBuffer = matrices_[matrix_index].data_[destination][index].data (); for( Int t=0; t::Acc( const Matrix& Z, Int i, Int j ) const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s::Acc( const Matrix& Z, Int i, Int j ) } } +// Locally Blocking template -void RmaInterface::Iput( Matrix& Z, Int i, Int j ) +void 
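Rput and Racc stage data exactly as the blocking paths do, but issue it through MPI's request-returning RMA calls and record one request per message in matrices_, so completion can later be enforced per transfer. A minimal sketch of the primitive underneath, assuming mpi::Rput forwards to MPI_Rput (Racc corresponds to MPI_Raccumulate with MPI_SUM in the same way):

#include <mpi.h>

// Request-based put: completing 'request' (MPI_Wait/MPI_Test) gives
// *local* completion, i.e. sendBuffer becomes reusable, while remote
// visibility still requires a flush or unlock on the window.
void RequestPut
( const double* sendBuffer, int count, int target, MPI_Aint disp,
  MPI_Win win, MPI_Request* request )
{
    MPI_Rput( sendBuffer, count, MPI_DOUBLE,
              target, disp, count, MPI_DOUBLE, win, request );
}

// usage: MPI_Request req;
//        RequestPut( buf, n, tgt, disp, win, &req );
//        MPI_Wait( &req, MPI_STATUS_IGNORE ); // buf reusable here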
RmaInterface::Put( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) - + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) + if( i < 0 || j < 0 ) LogicError("Submatrix offsets must be non-negative"); if ( !toBeAttachedForPut_ ) @@ -520,6 +495,8 @@ void RmaInterface::Iput( Matrix& Z, Int i, Int j ) const Int jLocalOffset = Length( j, Y.RowShift (), c ); const Int YLDim = Y.LDim (); + + const T* XBuffer = Z.LockedBuffer(); for( Int step=0; step::Iput( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, + const Int index = + NextIndex ( numEntries, putVector_[destination]); - - T* sendBuffer = putVector_[destination][index].data(); - T* XBuffer = Z.Buffer(); + + T* sendBuffer = putVector_[destination][index].data(); for( Int t=0; t::Iput( Matrix& Z, Int i, Int j ) mpi::Iput (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window); } + mpi::FlushLocal (destination, window); } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } -// Note: This as of now does not progress -// RMA routines -#ifdef EL_EXPLICIT_PROGRESS - RmaProgress (g.VCComm ()); -#endif } template -void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) +void RmaInterface::Put( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) - - if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + Put( const_cast&>(Z), i, j ); +} + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError("Global matrix cannot be updated."); + if( i < 0 || j < 0 ) + LogicError("Submatrix offsets must be non-negative."); DistMatrix& Y = *GlobalArrayPut_; - //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError("Submatrix out of bounds of global matrix."); + + //do rma related checks const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -587,18 +570,19 @@ void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - const Int YLDim = Y.LDim (); - + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + const T* XBuffer = Z.LockedBuffer(); + for( Int step=0; step::Iput( const Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, putVector_[destination]); + T* sendBuffer = putVector_[destination][index].data(); - const T* XBuffer = Z.LockedBuffer(); for( Int t=0; t::Iput( const Matrix& Z, Int i, Int j ) const T* thisXCol = &XBuffer[(rowShift+t*c)*XLDim]; for( Int s=0; s +void RmaInterface::Acc( Matrix& Z, Int i, Int j ) +{ + Acc( const_cast&>(Z), i, j ); +} + +// TODO Iget and Rget template void RmaInterface::Get( Matrix& Z, Int i, Int j ) { @@ -701,7 +691,8 @@ void 
RmaInterface::Get( Matrix& Z, Int i, Int j ) // no difference between localflush // and flush for Get mpi::FlushLocal (destination, window); - // update local matrix + + // update local matrix for( Int t=0; t::Get( Matrix& Z, Int i, Int j ) } } -// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, -// where X is height x width +// non-blocking interface template -void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) +void RmaInterface::Iput( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) - if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); + LogicError("Submatrix offsets must be non-negative"); + if ( !toBeAttachedForPut_ ) + LogicError("Global matrix cannot be updated"); DistMatrix& Y = *GlobalArrayPut_; - - if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); - //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError("Submatrix out of bounds of global matrix"); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -745,16 +733,19 @@ void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) const Int rowAlign = (Y.RowAlign() + j) % c; const Int XLDim = Z.LDim(); - const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; + const Int YLDim = Y.LDim (); + + const T* XBuffer = Z.LockedBuffer(); for( Int step=0; step::Iacc( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination]); + const Int index = + NextIndex ( + numEntries, + putVector_[destination] + ); T* sendBuffer = putVector_[destination][index].data(); - T* XBuffer = Z.Buffer(); for( Int t=0; t::Iacc( Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } -#ifdef EL_EXPLICIT_PROGRESS - RmaProgress (g.VCComm ()); -#endif } +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width template -void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) +void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) @@ -810,8 +802,7 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) LogicError("Submatrix out of bounds of global matrix."); - //do rma related checks - + //TODO rma related checks const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); @@ -826,7 +817,9 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - + + const T* XBuffer = Z.LockedBuffer(); + const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); @@ -847,8 +840,8 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; const Int index = RmaInterface::NextIndex ( numEntries, putVector_[destination]); + T* sendBuffer = 
putVector_[destination][index].data(); - const T* XBuffer = Z.LockedBuffer(); for( Int t=0; t::Iacc( const Matrix& Z, Int i, Int j ) if( receivingRow == 0 ) receivingCol = (receivingCol + 1) % c; } -#ifdef EL_EXPLICIT_PROGRESS - RmaProgress (g.VCComm ()); -#endif } +// Local completion of all ops upon +// return +template +void RmaInterface::LocalFlush() +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalFlush")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + mpi::FlushLocal ( window ); +} + +// Local completion (specific to Z) upon +// return template -void RmaInterface::Flush( Matrix& Z, Int i, Int j ) +void RmaInterface::LocalFlush( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalFlush")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer before flushing."); + + // if there are no request based RMA pending + // for Z, then this functions acts like Flush + // local all + if ( !anyPendingXfers( Z ) ) + { + LocalFlush(); + } + else + Wait ( Z ); +} + +// there is no use as of now in +// passing Z, as mpi3 flush enforces +// completion of *all* operations on +// process window +template +void RmaInterface::Flush( Matrix& Z ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) LogicError("Must initiate transfer before flushing."); + + mpi::Flush ( window ); +} - DistMatrix& Y = *GlobalArrayPut_; +template +bool RmaInterface::anyPendingXfers ( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::anyPendingXfers")) - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); - const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; + // by default, number of matrices + // == number of processes + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void *base_address = static_cast(const_cast(Z.LockedBuffer())); - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step +void RmaInterface::WaitAny( Matrix& Z ) +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::WaitAny")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + const Grid& g = ( toBeAttachedForPut_ ? 
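The LocalFlush/Flush pair above maps directly onto MPI-3's two completion levels, and LocalFlush's fallback is the key design point: when request-based transfers are pending for Z, waiting on those requests gives the same local guarantee as a flush. Assuming mpi::FlushLocal(window) and mpi::Flush(window) wrap the *_all variants:

#include <mpi.h>

// Local completion: origin buffers of every RMA op this rank issued
// on win may be reused, but targets need not see the data yet.
void LocalCompletion( MPI_Win win ) { MPI_Win_flush_local_all( win ); }

// Remote completion: the same operations are also complete at their
// targets. Per-target forms MPI_Win_flush(rank,win) and
// MPI_Win_flush_local(rank,win) exist as well (used by Get earlier).
void RemoteCompletion( MPI_Win win ) { MPI_Win_flush_all( win ); }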
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + + const Int p = g.Size(); + Int matrixIndex; + + const Int numMatrices = matrices_.size(); + + const void *base_address = static_cast(const_cast(Z.LockedBuffer())); + + // search for matrix base + for (Int m = 0; m < numMatrices; m++) { - const Int colShift = Shift( receivingRow, colAlign, r ); - const Int rowShift = Shift( receivingCol, rowAlign, c ); - const Int localHeight = Length( height, colShift, r ); - const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight*localWidth; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } - if( numEntries != 0 ) - { - const Int destination = receivingRow + r*receivingCol; - mpi::Flush ( destination, window ); - } + // matrix not found + if ( matrixIndex == numMatrices ) + return; - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + // data + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + if (!matrices_[matrixIndex].statuses_[rank][i]) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + return; + } + } } } template -void RmaInterface::Flush( const Matrix& Z, Int i, Int j ) +void RmaInterface::Wait( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush")) - if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); - - const DistMatrix& Y = *GlobalArrayGet_; + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Wait")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); - //do rma related checks - const Grid& g = Y.Grid(); - const Int r = g.Height(); - const Int c = g.Width(); + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); - const Int myProcessRow = g.Row(); - const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; + Int matrixIndex; + + const Int numMatrices = matrices_.size(); + + const void *base_address = static_cast(const_cast(Z.LockedBuffer())); - // local width and height - const Int height = Z.Height(); - const Int width = Z.Width(); + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } - // find destination - Int receivingRow = myProcessRow; - Int receivingCol = myProcessCol; - for( Int step=0; step +void RmaInterface::Waitall () +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Waitall")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + const Grid& g = ( toBeAttachedForPut_ ? 
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
+    const Int p = g.Size();
+
+    const Int numMatrices = matrices_.size();
+
+    // data
+    for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex)
+    {
+        for (int rank = 0; rank < p; ++rank)
+        {
+            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+            for (int i = 0; i < numDataStatuses; i++)
+            {
+                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+                matrices_[matrixIndex].statuses_[rank][i] = true;
+            }
+        }
     }
 }

 template
-void RmaInterface::Flush( Matrix& Z )
+bool RmaInterface::Test( Matrix& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush"))
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Test"))
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError("Must initiate transfer at first.");
+
+    const Grid& g = ( toBeAttachedForPut_ ?
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
+
+    const Int p = g.Size();
+    Int matrixIndex;

-    if( !toBeAttachedForPut_ && !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer before flushing.");
-    // Get does not require flush because
-    // we guarantee local completion for
-    // Put/Get/Acc
-    if( toBeAttachedForGet_ )
-        return;
+    const Int numMatrices = matrices_.size();
+
+    const void *base_address = static_cast(const_cast(Z.LockedBuffer()));

-    mpi::Flush ( window );
+    // search for matrix base
+    for (Int m = 0; m < numMatrices; m++)
+    {
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+        matrixIndex = m+1;
+    }
+
+    // matrix not found
+    if ( matrixIndex == numMatrices )
+        return true;
+
+    for (int rank = 0; rank < p; ++rank)
+    {
+        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            continue;
+
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+        for (int i = 0; i < numDataStatuses; i++)
+        {
+            matrices_[matrixIndex].statuses_[rank][i] =
+                !mpi::Test (matrices_[matrixIndex].requests_[rank][i]);
+            if (matrices_[matrixIndex].statuses_[rank][i])
+                return false;
+        }
+    }
+
+    return true;
 }

+// TODO use mpi::Testany instead of mpi::Test;
+// at present the requests are stored in a vector
+// of deques, so they cannot be passed as the
+// contiguous array that Testany requires
 template
-void RmaInterface::Flush( const Matrix& Z )
+bool RmaInterface::TestAny( Matrix& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush"))
+    DEBUG_ONLY(CallStackEntry cse("RmaInterface::TestAny"))
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError("Must initiate transfer at first.");

-    if( !toBeAttachedForPut_ && !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer before flushing.");
-    // Get does not require flush because
-    // we guarantee local completion for
-    // Put/Get/Acc
-    if( toBeAttachedForGet_ )
-        return;
+    const Grid& g = ( toBeAttachedForPut_ ?
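+                      // NOTE: TestAny returns true as soon as one pending
+                      // request has completed (or when Z has nothing
+                      // outstanding), and false while every request is
+                      // still in flight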
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + + const Int p = g.Size(); + Int matrixIndex; - mpi::Flush ( window ); + const Int numMatrices = matrices_.size(); + + const void *base_address = static_cast(const_cast(Z.LockedBuffer())); + + // search for matrix base + for (Int m = 0; m < numMatrices; m++) + { + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + matrixIndex = m+1; + } + + // matrix not found + if ( matrixIndex == numMatrices ) + return true; + + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); + if (matrices_[matrixIndex].statuses_[rank][i]) + continue; + else + return true; + } + } + + return false; +} + +template +bool RmaInterface::Testall() +{ + DEBUG_ONLY(CallStackEntry cse("RmaInterface::Testall")) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError("Must initiate transfer at first."); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + + const Int numMatrices = matrices_.size(); + + // data + for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex) + { + for (int rank = 0; rank < p; ++rank) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for (int i = 0; i < numDataStatuses; i++) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); + if (matrices_[matrixIndex].statuses_[rank][i]) + return false; + } + } + } + + return true; } template @@ -1021,6 +1246,7 @@ void RmaInterface::Detach() putVector_.clear(); getVector_.clear(); + matrices_.clear(); mpi::WindowUnlock (window); mpi::WindowFree (window); From 95c9c4be15ab7ea4524b5d7aeaef0cfb9313d50f Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Sat, 14 Feb 2015 02:59:35 -0800 Subject: [PATCH 103/110] added some const interfaces, and fixed an error for request based rma ops --- src/core/AxpyInterface2.0.cpp | 2 +- src/core/RmaInterface.cpp | 160 ++++++++++++++++++++-------------- 2 files changed, 95 insertions(+), 67 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 548c6f3fa6..fe09182685 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -818,7 +818,7 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) const Int YLDim = Y.LDim (); const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + const void* Buffer = static_cast< void * >(const_cast< T * >(Z.LockedBuffer())); for( Int step=0; step::Attach( DistMatrix& Z ) // transfers -- put, get and acc are possible if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - } - - const Grid& g = Z.Grid(); - const Int p = g.Size (); + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; - if (matrices_.size() != p) - { - matrices_.resize( p ); - } + const Grid& g = Z.Grid(); + const Int p = g.Size (); - if (putVector_.size() != p) - { - getVector_.resize( p ); - putVector_.resize( p ); - } + if ( 
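+         // first Attach: lazily create the bookkeeping for request-based
+         // transfers (one matrix_params_ slot plus per-rank request and
+         // status queues)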
matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize(p); + mp.requests_.resize(p); + mp.statuses_.resize(p); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } - // TODO rma related checks - // creation of window - const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); - const Int bufferSize = numEntries * sizeof(T); - void* baseptr = reinterpret_cast(Z.Buffer ()); - assert(baseptr != NULL); + if (putVector_.empty()) + { + getVector_.resize( p ); + putVector_.resize( p ); + } - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); + // TODO rma related checks + // creation of window + const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int bufferSize = numEntries * sizeof(T); + void* baseptr = reinterpret_cast< void * >(Z.Buffer ()); + assert(baseptr != NULL); + + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); + mpi::WindowLock (window); + } } // for gets @@ -152,36 +159,30 @@ void RmaInterface::Attach( const DistMatrix& X ) else LogicError("Must detach before reattaching."); - if( !toBeAttachedForGet_ && !toBeAttachedForPut_) + if( !toBeAttachedForGet_ ) { - GlobalArrayGet_ = &X; - toBeAttachedForGet_ = true; - GlobalArrayPut_ = 0; - toBeAttachedForPut_ = false; - } - else - LogicError("Cannot update Global matrix."); + GlobalArrayGet_ = &X; + toBeAttachedForGet_ = true; + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; - const Grid& g = X.Grid(); - const Int p = g.Size (); + const Grid& g = X.Grid(); + const Int p = g.Size (); - if (matrices_.size() != p) - { - matrices_.resize( p ); - } - if (getVector_.size() != p) - { - getVector_.resize( p ); - } - - //TODO rma related checks - const Int numEntries = X.LocalHeight () * X.LocalWidth (); - const Int bufferSize = numEntries * sizeof(T); - void* baseptr = static_cast(const_cast(X.LockedBuffer ())); - assert (baseptr != NULL); + if (getVector_.size() != p) + { + getVector_.resize( p ); + } - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); + //TODO rma related checks + const Int numEntries = X.LocalHeight () * X.LocalWidth (); + const Int bufferSize = numEntries * sizeof(T); + void* baseptr = static_cast(const_cast(X.LockedBuffer ())); + assert (baseptr != NULL); + + mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); + mpi::WindowLock (window); + } } // for standard passive rma @@ -213,8 +214,9 @@ Int RmaInterface::NextIndex ( assert ( base_address != NULL ); Int matrixIndex = 0; - const DistMatrix& Y = *GlobalArrayGet_; - const Grid& g = Y.Grid(); + const Grid& g = ( toBeAttachedForPut_ ? 
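+                      // the grid now comes from whichever array is attached,
+                      // rather than unconditionally dereferencing
+                      // GlobalArrayGet_ (presumably the request-based RMA fix
+                      // this commit's subject refers to)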
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numMatrices = matrices_.size(); @@ -286,7 +288,7 @@ Int RmaInterface::NextIndex ( // request based RMA operations template -void RmaInterface::Rput( Matrix& Z, Int i, Int j ) +void RmaInterface::Rput( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Rput")) @@ -348,7 +350,7 @@ void RmaInterface::Rput( Matrix& Z, Int i, Int j ) (Int (matrices_[matrix_index].data_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = matrices_[matrix_index].data_[destination][index].data (); + T *sendBuffer = reinterpret_cast(matrices_[matrix_index].data_[destination][index].data ()); for( Int t=0; t::Rput( Matrix& Z, Int i, Int j ) } } +template +void RmaInterface::Rput( Matrix& Z, Int i, Int j ) +{ + Rput( const_cast&>(Z), i, j ); +} + // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void RmaInterface::Racc( Matrix& Z, Int i, Int j ) +void RmaInterface::Racc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Racc")) @@ -403,9 +411,9 @@ void RmaInterface::Racc( Matrix& Z, Int i, Int j ) const Int height = Z.Height(); const Int width = Z.Width(); - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); - + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); + Int matrix_index; const Int iLocalOffset = Length( i, Y.ColShift (), r ); @@ -436,8 +444,8 @@ void RmaInterface::Racc( Matrix& Z, Int i, Int j ) (Int (matrices_[matrix_index].data_[destination][index].size ()) != numEntries) LogicError ("Error in NextIndex");) - T *sendBuffer = matrices_[matrix_index].data_[destination][index].data (); - + T *sendBuffer = reinterpret_cast(matrices_[matrix_index].data_[destination][index].data ()); + for( Int t=0; t::Racc( Matrix& Z, Int i, Int j ) mpi::Racc (&sendBuffer[t*localHeight], localHeight, destination, disp, localHeight, window, matrices_[matrix_index].requests_[destination][index]); - } + } } receivingRow = (receivingRow + 1) % r; if( receivingRow == 0 ) @@ -458,6 +466,12 @@ void RmaInterface::Racc( Matrix& Z, Int i, Int j ) } } +template +void RmaInterface::Racc( Matrix& Z, Int i, Int j ) +{ + Racc( const_cast&>(Z), i, j ); +} + // Locally Blocking template void RmaInterface::Put( const Matrix& Z, Int i, Int j ) @@ -709,7 +723,7 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) // non-blocking interface template -void RmaInterface::Iput( Matrix& Z, Int i, Int j ) +void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) @@ -788,7 +802,7 @@ void RmaInterface::Iput( Matrix& Z, Int i, Int j ) // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template -void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) +void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) { DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) @@ -838,8 +852,10 @@ void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination]); + const Int index = + NextIndex ( + numEntries, + putVector_[destination]); T* sendBuffer = putVector_[destination][index].data(); @@ -861,6 +877,18 @@ void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) } } +template +void 
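+// NOTE: the non-const overloads below forward to the const versions;
+// the const_cast is safe because the const paths only read from
+// Z.LockedBuffer()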
RmaInterface::Iput( Matrix& Z, Int i, Int j ) +{ + Iput( const_cast&>(Z), i, j ); +} + +template +void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) +{ + Iacc( const_cast&>(Z), i, j ); +} + // Local completion of all ops upon // return template From 2fb071ccc34da1ef3201d25f4bd560c376b1ecbc Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 23 Feb 2015 12:51:04 -0800 Subject: [PATCH 104/110] delete test case --- tests/core/HFsimul.cpp | 188 ----------------------------------------- 1 file changed, 188 deletions(-) delete mode 100644 tests/core/HFsimul.cpp diff --git a/tests/core/HFsimul.cpp b/tests/core/HFsimul.cpp deleted file mode 100644 index 1761497023..0000000000 --- a/tests/core/HFsimul.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - Copyright (c) 2009-2014, Jack Poulson - Copyright (c) 2011, The University of Texas at Austin - Copyright (c) 2014, Sayan Ghosh, University of Houston - All rights reserved. - - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at -http://opensource.org/licenses/BSD-2-Clause -*/ -/* - * This test approximates a Hartree-Fock - * application, all the ranks perform Acc - * (or Axpy) on different patches of the - * matrix during an epoch, then Flush all, - * then a Barrier, then another epoch - * where all the ranks perform Get (on - * their patch) - * Some of the MPI functions are not defined - * in El, hence this test mixes MPI routines - * and MPI from El. This is nasty, but at one - * point would be made better. - */ -#include "El.hpp" -#include -using namespace El; - -//#define ITER 10 -#define ITER 1 -//#define DIM 1000 -//#define AXPY_DIM 100 -//#define DIM 20 -//#define AXPY_DIM 4 -#define DIM 8 -#define AXPY_DIM 2 - -#define ALPHA 2.0 -#define FOP_ROOT 0 - -#if MPI_VERSION < 3 -# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 -#endif - -long ReadInc (MPI_Win win, MPI_Aint offset, long inc) -{ - long otemp; - MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, - win); - MPI_Win_flush (FOP_ROOT, win); - - return otemp; -} - -int main (int argc, char *argv[]) -{ - Initialize (argc, argv); - mpi::Comm comm = mpi::COMM_WORLD; - mpi::Window win; - const Int commRank = mpi::Rank (comm); - const Int commSize = mpi::Size (comm); - double t1, t2, seconds; - void *win_base; - long counter, next = 0; - - assert (DIM % AXPY_DIM == 0); - - try - { - // Initialization - // Allocate memory and create window for ReadInc - MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, - comm.comm, &win_base, &win); - memset (win_base, 0, sizeof (long)); - MPI_Win_lock_all (MPI_MODE_NOCHECK, win); - - // Create window - Grid grid (comm); - - // Create an DIM X DIM distributed matrix over the given grid - DistMatrix < double, MC, MR > A (DIM, DIM, grid); - - // Set every entry of A to zero - Zeros (A, DIM, DIM); - - // Print the original A - if (DIM <= 20) - Print (A, "Original distributed A"); - - t1 = MPI_Wtime(); - for (Int k = 0; k < ITER; ++k) - { - if (commRank == 0) - std::cout << "Iteration " << k << std::endl; - - RmaInterface < double > Rmaint; - Rmaint.Attach (A); - - Matrix < double >B (AXPY_DIM, AXPY_DIM); - Identity (B, AXPY_DIM, AXPY_DIM); - // AXPY into parts of the DistMatrix - counter = ReadInc (win, 0, (long) 1); - for (int i = 0; i < DIM; i += AXPY_DIM) - { - if (counter == next) - { - for (int j = 0; j < DIM; j += AXPY_DIM) - { - Rmaint.Acc (ALPHA, B, i, j); -#if DEBUG > 2 - std::cout << std::to_string(commRank) + ": AXPY patch: " - + 
std::to_string(i) + "," + std::to_string(j) - << std::endl; -#endif - } - counter = ReadInc (win, 0, (long) 1); - } - next++; - } - // Flush all operations from B to DistMatrix - Rmaint.Flush ( B ); - mpi::Barrier ( comm ); - // Bring my updated patch to me from DistMatrix - Matrix < double >C; - Zeros (C, AXPY_DIM, AXPY_DIM); - for (int i = 0; i < DIM; i += AXPY_DIM) - { - if (counter == next) - { - for (int j = 0; j < DIM; j += AXPY_DIM) - { - Rmaint.Get (C, i, j); -#if DEBUG > 2 - std::cout << std::to_string(commRank) + ": GET patch: " - + std::to_string(i) + "," + std::to_string(j) - << std::endl; -#endif - } - counter = ReadInc (win, 0, (long) 1); - } - next++; - } - // Get doesn't require flush - // Collectively detach in order to finish filling process 0's request - Rmaint.Detach (); - -#if DEBUG > 1 - for (int j = 0; j < commSize; j++) - { - if (j == commRank) - { - if (DIM <= 20) - Print (A, "Updated distributed A"); - } - } - mpi::Barrier ( comm ); - for (int j = 0; j < commSize; j++) - { - if (j == commRank) - { - // Process 0 can now locally print its copy of A - if (DIM <= 20) - Print (C, "Patch of A"); - } - } - mpi::Barrier ( comm ); -#endif - } - t2 = MPI_Wtime(); - seconds = (t2 - t1); ///ITER; - double total_secs; - - MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); - - if (commRank == 0) - printf("Time taken for AXPY (secs):%lf \n", total_secs); - } - catch (std::exception & e) - { - ReportException (e); - } - - // clear window object for FOP - MPI_Win_unlock_all (win); - MPI_Win_free (&win); - - mpi::Finalize (); - return 0; -} From 6bb22429af8da4bdfa0b6304995132a996139935 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 23 Feb 2015 18:35:14 -0800 Subject: [PATCH 105/110] merge --- src/core/AxpyInterface2.0.cpp | 9 +++------ src/core/RmaInterface.cpp | 12 +++--------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index fe09182685..6bf32ea4cc 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -3,7 +3,7 @@ This file is part of Elemental and is under the BSD 2-Clause License, which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ -#include "El-lite.hpp" +#include "El.hpp" #include namespace El @@ -2103,10 +2103,7 @@ void AxpyInterface2::Detach() coords_.clear(); } -template class AxpyInterface2; -template class AxpyInterface2; -template class AxpyInterface2; -template class AxpyInterface2>; -template class AxpyInterface2>; +#define PROTO(T) template class AxpyInterface2; +#include "El/macros/Instantiate.h" } // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 35c2fb81c6..70fd3a3f99 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -12,12 +12,9 @@ This file is part of Elemental and is under the BSD 2-Clause License, which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ -#include "El-lite.hpp" +#include "El.hpp" #include -// TODO Complete the const interfaces... -// TODO RMA related checks pending (e.g bounds checking)... 
-// TODO debug messages #if MPI_VERSION>=3 namespace El { @@ -1280,11 +1277,8 @@ void RmaInterface::Detach() mpi::WindowFree (window); } -template class RmaInterface; -template class RmaInterface; -template class RmaInterface; -template class RmaInterface>; -template class RmaInterface>; +#define PROTO(T) template class RmaInterface; +#include "El/macros/Instantiate.h" } // namespace El #endif From 86f4a805a88f38ea42111563794a98923905a567 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 23 Feb 2015 19:35:55 -0800 Subject: [PATCH 106/110] added a macro for rma axpy, so rma axpy gets enabled when mpi version >=3 and rma axpy macro is enabled --- include/El/core/RmaInterface.hpp | 4 ++-- include/El/core/imports/mpi.hpp | 24 ++++++++++++++++-------- src/core/RmaInterface.cpp | 5 +++-- src/core/imports/mpi.cpp | 8 ++++---- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index 92e6f442f9..d47a9233ac 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -16,7 +16,7 @@ #define EL_RMAINTERFACE_HPP namespace El { -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) template class RmaInterface { @@ -119,6 +119,6 @@ class RmaInterface void WaitAny( Matrix& Z ); void WaitAny( const Matrix& Z ); }; -#endif //MPI-3 +#endif // EL_ENABLE_RMA_AXPY } // namespace El #endif // ifndef EL_RMAINTERFACE_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 72a52af11e..8f024ba176 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -34,13 +34,12 @@ namespace mpi { // consensus instead of El strict EOM matching // see - Scalable communication protocols for // dynamic sparse data exchange by Hoefler, et al -#ifndef EL_USE_NONBLOCKING_CONSENSUS -#define EL_USE_NONBLOCKING_CONSENSUS +#ifndef EL_USE_IBARRIER_FOR_AXPY +#define EL_USE_IBARRIER_FOR_AXPY #endif -// Experimental MPI performance enhancers -#ifndef EL_MPI_EXPERIMENTAL -#define EL_MPI_EXPERIMENTAL +#ifndef EL_ENABLE_RMA_AXPY +#define EL_ENABLE_RMA_AXPY #endif // Use derived datatypes for strided @@ -59,6 +58,15 @@ namespace mpi { //#define EL_NO_ACC_ORDERING //#endif +// put/get atomicity +//#ifndef EL_ENSURE_PUT_ATOMICITY +//#define EL_ENSURE_PUT_ATOMICITY +//#endif + +//#ifndef EL_ENSURE_GET_ATOMICITY +//#define EL_ENSURE_GET_ATOMICITY +//#endif + #ifndef EL_INT_SAFE_CAST #define EL_INT_SAFE_CAST(x) \ (x < std::numeric_limits::max () && \ @@ -240,7 +248,7 @@ void Translate // MPI-3 one-sided // =============== -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) // Utilities // --------- void SetWindowProp ( Window& window, int prop ); @@ -343,11 +351,11 @@ void Flush( int target_rank, Window& window ); void Flush( Window & window ); void FlushLocal( int target_rank, Window& window ); void FlushLocal( Window & window ); -#endif +#endif // EL_ENABLE_RMA_AXPY // Utilities void Barrier( Comm comm ); -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) void IBarrier( Comm comm, Request& request ); #endif void RequestFree( Request& request ); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 70fd3a3f99..8a69ab49f2 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -15,7 +15,7 @@ which can be found in the LICENSE file in the root directory, or at #include "El.hpp" #include -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) namespace El { template @@ 
-1281,4 +1281,5 @@ void RmaInterface::Detach() #include "El/macros/Instantiate.h" } // namespace El -#endif +#endif // EL_ENABLE_RMA_AXPY + diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 853da2e0a8..e48599de9b 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -10,7 +10,7 @@ */ #include "El.hpp" -#include +#include typedef unsigned char* UCP; @@ -406,7 +406,7 @@ void Translate // MPI-3 RMA functions // ================== -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) long ReadInc (Window & win, Aint offset, long inc, int fop_root) { DEBUG_ONLY (CallStackEntry cse ("mpi::ReadInc")) @@ -1363,7 +1363,7 @@ void FlushLocal (Window & window) DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) SafeMpi (MPI_Win_flush_local_all (window)); } -#endif +#endif // EL_ENABLE_RMA_AXPY // Various utilities // ================= @@ -1381,7 +1381,7 @@ void Barrier (Comm comm) SafeMpi (MPI_Barrier (comm.comm)); } -#if MPI_VERSION>=3 +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) void IBarrier (Comm comm, Request & request) { DEBUG_ONLY (CallStackEntry cse ("mpi::IBarrier")) From b235bd2261559f44015238467c8fd1f0641f045c Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Mon, 23 Feb 2015 21:12:51 -0800 Subject: [PATCH 107/110] moved axpy related macros from header file to cmake build files --- CMakeLists.txt | 5 + cmake/config.h.cmake | 5 + include/El/core/imports/mpi.hpp | 41 +++--- src/core/imports/mpi.cpp | 241 ++++++++++++++++---------------- 4 files changed, 157 insertions(+), 135 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de1d1f58e0..49486a1456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,11 @@ option(EL_USE_64BIT_INTS "Use 64-bit integers where possible" OFF) option(EL_USE_CUSTOM_ALLTOALLV "Avoid MPI_Alltoallv for performance reasons" ON) option(EL_BARRIER_IN_ALLTOALLV "Barrier before posting non-blocking recvs" OFF) +# MPI misc. 
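+# Both options below may be toggled at configure time, e.g. with the
+# (hypothetical) invocation:
+#   cmake -DEL_ENABLE_RMA_AXPY=ON -DEL_USE_IBARRIER_FOR_AXPY=OFF ..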
+# Enable MPI-3 routines +option(EL_ENABLE_RMA_AXPY "Choose new Rma Axpy interface implemented using MPI-3 one sided routines" ON) +option(EL_USE_IBARRIER_FOR_AXPY "Use MPI-3 IBarrier for synchronization in AxpyInterface" ON) + # If the version of METIS packaged with Elemental is to be built (the default), # then no METIS-specific variables need to be specified, but if the user prefers # to use their own version, then the root path of the installation should be diff --git a/cmake/config.h.cmake b/cmake/config.h.cmake index 9e375f56a2..d9565adb00 100644 --- a/cmake/config.h.cmake +++ b/cmake/config.h.cmake @@ -79,6 +79,11 @@ #cmakedefine EL_VECTOR_WARNINGS #cmakedefine EL_AVOID_OMP_FMA +/* MPI-3 related */ +#cmakedefine EL_ENABLE_RMA_AXPY +#cmakedefine EL_USE_IBARRIER_FOR_AXPY + + #cmakedefine EL_DECLSPEC #ifdef EL_DECLSPEC # define EL_EXPORT __declspec(dllexport) diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 8f024ba176..4585984111 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -34,19 +34,19 @@ namespace mpi { // consensus instead of El strict EOM matching // see - Scalable communication protocols for // dynamic sparse data exchange by Hoefler, et al -#ifndef EL_USE_IBARRIER_FOR_AXPY -#define EL_USE_IBARRIER_FOR_AXPY -#endif +//#ifndef EL_USE_IBARRIER_FOR_AXPY +//#define EL_USE_IBARRIER_FOR_AXPY +//#endif -#ifndef EL_ENABLE_RMA_AXPY -#define EL_ENABLE_RMA_AXPY -#endif +//#ifndef EL_ENABLE_RMA_AXPY +//#define EL_ENABLE_RMA_AXPY +//#endif // Use derived datatypes for strided // vector communication patterns -#ifndef EL_USE_DERIVED_DATATYPE -#define EL_USE_DERIVED_DATATYPE -#endif +//#ifndef EL_USE_DERIVED_DATATYPE +//#define EL_USE_DERIVED_DATATYPE +//#endif // explicit progress for RMA //#ifndef EL_EXPLICIT_PROGRESS @@ -113,7 +113,7 @@ typedef MPI_Request Request; typedef MPI_Status Status; typedef MPI_Message Message; typedef MPI_User_function UserFunction; -#if MPI_VERSION >= 3 +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) typedef MPI_Win Window; typedef enum { @@ -246,6 +246,17 @@ void Translate ( Comm origComm, int size, const int* origRanks, Comm newComm, int* newRanks ); +// Derived datatype +// ================ +#ifdef EL_USE_DERIVED_DATATYPE +// strided/vector to datatype +void StridedDatatype (El_strided_t* stride_descr, + mpi::Datatype old_type, mpi::Datatype* new_type, + size_t* source_dims); +void VectorDatatype (El_iov_t * vect_descr, + mpi::Datatype old_type, mpi::Datatype * new_type, + vector_pattern_t data_pattern); +#endif // EL_USE_DERIVED_DATATYPE // MPI-3 one-sided // =============== #if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) @@ -254,17 +265,11 @@ void Translate void SetWindowProp ( Window& window, int prop ); void CheckBounds ( Window & window, mpi::Datatype win_type, mpi::Datatype type, size_t count, ptrdiff_t target_offset ); +#ifdef EL_EXPLICIT_PROGRESS void RmaProgress ( Comm comm ); +#endif long ReadInc (Window & win, Aint offset, long inc, int fop_root); - -// strided/vector to datatype -void StridedDatatype (El_strided_t* stride_descr, - mpi::Datatype old_type, mpi::Datatype* new_type, - size_t* source_dims); -void VectorDatatype (El_iov_t * vect_descr, - mpi::Datatype old_type, mpi::Datatype * new_type, - vector_pattern_t data_pattern); // Window creation/update/delete // ----------------------------- void WindowLock( int rank, Window& window ); diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index e48599de9b..8152a72dab 100644 --- a/src/core/imports/mpi.cpp 
+++ b/src/core/imports/mpi.cpp @@ -403,124 +403,11 @@ void Translate Free (newGroup); } -// MPI-3 RMA functions -// ================== - -#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) -long ReadInc (Window & win, Aint offset, long inc, int fop_root) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::ReadInc")) - long otemp; - SafeMpi ( MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, fop_root, offset, MPI_SUM, - win) ); - SafeMpi ( MPI_Win_flush_local (fop_root, win) ); - - return otemp; -} - -void SetWindowProp (Window & window, int prop) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::SetWindowProp")) - Info info; - - SafeMpi (MPI_Info_create (&info)); - - if (prop & (1 << 0)) // strict - SafeMpi (MPI_Info_set - (info, "accumulate_ordering", - "rar,raw,war,waw")); - - - if (prop & (1 << 1)) // partial - SafeMpi (MPI_Info_set - (info, "accumulate_ordering", - "rar,waw")); - - if (prop & (1 << 2)) // none - SafeMpi (MPI_Info_set - (info, "accumulate_ops", - "same_op_no_op")); - - SafeMpi (MPI_Win_set_info (window, info)); -} - -//NOTE assuming MPI_MODE_NOCHECK -void WindowLock (int rank, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) - SafeMpi (MPI_Win_lock - (MPI_LOCK_SHARED, rank, MPI_MODE_NOCHECK, - window)); -} - -void WindowLock (Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) - SafeMpi (MPI_Win_lock_all - (MPI_MODE_NOCHECK, window)); -} - -void WindowUnlock (int rank, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) - SafeMpi (MPI_Win_unlock (rank, window)); -} - -void WindowUnlock (Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) - SafeMpi (MPI_Win_unlock_all (window)); -} - -// RMA Utilities -void WindowCreate (void *baseptr, int size, Comm comm, Window & window) -{ - DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate")) - - // TODO use alloc_shm - SafeMpi (MPI_Win_create - (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, - comm.comm, &window)); -#ifdef EL_NO_ACC_ORDERING - SetWindowProp (window, NO_ACC_ORDERING); -#endif -} - -void CheckBounds (Window & window, Datatype win_type, Datatype type, - size_t count, ptrdiff_t target_offset) -{ - int flag, type_size, win_type_size; - size_t displ; - void * dest=NULL; - - SafeMpi (MPI_Type_size (type, &type_size)); - SafeMpi (MPI_Type_size (win_type, &win_type_size)); - Aint lb, extent; - - SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); - - /* Calculate displacement from beginning of the window */ - if (dest == MPI_BOTTOM) - displ = 0; - else - displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); - - SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); - - // invalid remote address - assert (displ >= 0 && displ < win_type_size); - // transfer out of range - assert (displ + count*extent <= win_type_size); -} - -void RmaProgress ( Comm comm ) -{ - int flag; - SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, - comm.comm, &flag, MPI_STATUS_IGNORE)); -} -// TODO these functions for DDT creation are +// DERIVED Datatype creation +// ========================= +// FIXME these functions for DDT creation are // completely untested +#ifdef EL_USE_DERIVED_DATATYPE void StridedDatatype (El_strided_t* stride_descr, Datatype old_type, Datatype* new_type, size_t* source_dims) @@ -647,6 +534,126 @@ void VectorDatatype (El_iov_t * vect_descr, (const int *) vect_descr->sizes, vect_descr->offsets, old_type, new_type) ); } +#endif // EL_USE_DERIVED_DATATYPE + +// MPI-3 RMA functions +// 
================== + +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +long ReadInc (Window & win, Aint offset, long inc, int fop_root) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ReadInc")) + long otemp; + SafeMpi ( MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, fop_root, offset, MPI_SUM, + win) ); + SafeMpi ( MPI_Win_flush_local (fop_root, win) ); + + return otemp; +} + +void SetWindowProp (Window & window, int prop) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::SetWindowProp")) + Info info; + + SafeMpi (MPI_Info_create (&info)); + + if (prop & (1 << 0)) // strict + SafeMpi (MPI_Info_set + (info, "accumulate_ordering", + "rar,raw,war,waw")); + + + if (prop & (1 << 1)) // partial + SafeMpi (MPI_Info_set + (info, "accumulate_ordering", + "rar,waw")); + + if (prop & (1 << 2)) // none + SafeMpi (MPI_Info_set + (info, "accumulate_ops", + "same_op_no_op")); + + SafeMpi (MPI_Win_set_info (window, info)); +} + +//NOTE assuming MPI_MODE_NOCHECK +void WindowLock (int rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) + SafeMpi (MPI_Win_lock + (MPI_LOCK_SHARED, rank, MPI_MODE_NOCHECK, + window)); +} + +void WindowLock (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) + SafeMpi (MPI_Win_lock_all + (MPI_MODE_NOCHECK, window)); +} + +void WindowUnlock (int rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) + SafeMpi (MPI_Win_unlock (rank, window)); +} + +void WindowUnlock (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) + SafeMpi (MPI_Win_unlock_all (window)); +} + +// RMA Utilities +void WindowCreate (void *baseptr, int size, Comm comm, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate")) + + // TODO use alloc_shm + SafeMpi (MPI_Win_create + (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, + comm.comm, &window)); +#ifdef EL_NO_ACC_ORDERING + SetWindowProp (window, NO_ACC_ORDERING); +#endif +} + +void CheckBounds (Window & window, Datatype win_type, Datatype type, + size_t count, ptrdiff_t target_offset) +{ + int flag, type_size, win_type_size; + size_t displ; + void * dest=NULL; + + SafeMpi (MPI_Type_size (type, &type_size)); + SafeMpi (MPI_Type_size (win_type, &win_type_size)); + Aint lb, extent; + + SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); + + /* Calculate displacement from beginning of the window */ + if (dest == MPI_BOTTOM) + displ = 0; + else + displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); + + SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); + + // invalid remote address + assert (displ >= 0 && displ < win_type_size); + // transfer out of range + assert (displ + count*extent <= win_type_size); +} + +#ifdef EL_EXPLICIT_PROGRESS +void RmaProgress ( Comm comm ) +{ + int flag; + SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, + comm.comm, &flag, MPI_STATUS_IGNORE)); +} +#endif void WindowFree (Window & window) { From ebb4fa685244d4a0d241e80adad21bb2af16e418 Mon Sep 17 00:00:00 2001 From: Sayan Ghosh Date: Wed, 25 Feb 2015 11:09:32 -0800 Subject: [PATCH 108/110] applied astyle to fix indentation, would do some more commits --- include/El/core/AxpyInterface.hpp | 52 +- include/El/core/AxpyInterface2.0.hpp | 102 +- include/El/core/RmaInterface.hpp | 60 +- src/core/AxpyInterface.cpp | 1331 ++++++------- src/core/AxpyInterface2.0.cpp | 2573 +++++++++++++------------- src/core/RmaInterface.cpp | 1098 ++++++----- 6 files changed, 2563 insertions(+), 2653 deletions(-) diff --git 
a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index d9f9bd8e47..c049de2704 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -7,8 +7,8 @@ This interface is mainly due to Martin Schatz, but it was put into its current form by Jack Poulson. - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #pragma once @@ -24,16 +24,16 @@ using namespace AxpyTypeNS; template class AxpyInterface -{ +{ public: AxpyInterface(); ~AxpyInterface(); - + AxpyInterface( AxpyType type, DistMatrix& Z ); - AxpyInterface( AxpyType type, const DistMatrix& Z ); + AxpyInterface( AxpyType type, const DistMatrix& Z ); - void Attach( AxpyType type, DistMatrix& Z ); - void Attach( AxpyType type, const DistMatrix& Z ); + void Attach( AxpyType type, DistMatrix& Z ); + void Attach( AxpyType type, const DistMatrix& Z ); void Axpy( T alpha, Matrix& Z, Int i, Int j ); void Axpy( T alpha, const Matrix& Z, Int i, Int j ); @@ -42,16 +42,16 @@ class AxpyInterface private: #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) - static const Int - DATA_TAG =1, - DATA_REQUEST_TAG=2, - DATA_REPLY_TAG =3; + static const Int + DATA_TAG =1, + DATA_REQUEST_TAG=2, + DATA_REPLY_TAG =3; #else - static const Int - DATA_TAG =1, - EOM_TAG =2, - DATA_REQUEST_TAG=3, - DATA_REPLY_TAG =4; + static const Int + DATA_TAG =1, + EOM_TAG =2, + DATA_REQUEST_TAG=3, + DATA_REPLY_TAG =4; #endif //request object for polling on Issends @@ -64,17 +64,17 @@ class AxpyInterface #else std::vector sentEomTo_, haveEomFrom_; std::vector eomSendRequests_; -#endif - - std::vector> - sendingData_, sendingRequest_, sendingReply_; - std::vector> - dataSendRequests_, requestSendRequests_, replySendRequests_; - +#endif + + std::vector> + sendingData_, sendingRequest_, sendingReply_; + std::vector> + dataSendRequests_, requestSendRequests_, replySendRequests_; + std::vector recvVector_; std::vector>> - dataVectors_, requestVectors_, replyVectors_; - + dataVectors_, requestVectors_, replyVectors_; + byte sendDummy_, recvDummy_; // Progress functions @@ -92,7 +92,7 @@ class AxpyInterface Int ReadyForSend ( Int sendSize, std::deque>& sendVectors, - std::deque& requests, + std::deque& requests, std::deque& requestStatuses ); void HandleLocalToGlobalData(); diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp index d0b63845a2..9a54c61dfc 100644 --- a/include/El/core/AxpyInterface2.0.hpp +++ b/include/El/core/AxpyInterface2.0.hpp @@ -24,75 +24,75 @@ class AxpyInterface2 void Detach(); // remote update routines - - // requires Flush for local+remote + + // requires Flush for local+remote // completion - void Iput( Matrix& Z, Int i, Int j ); + void Iput( Matrix& Z, Int i, Int j ); void Iput( const Matrix& Z, Int i, Int j ); void Iget( Matrix& Z, Int i, Int j ); void Iacc( Matrix& Z, Int i, Int j ); void Iacc( const Matrix& Z, Int i, Int j ); - + // locally blocking update routines // reuse input buffer when returns void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - void Put( Matrix& Z, Int i, Int j ); - void Put( const Matrix& Z, Int i, Int j ); + void Put( Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); // End to End blocking // will be deprecated soon void 
Eacc( Matrix& Z, Int i, Int j ); void Eacc( const Matrix& Z, Int i, Int j ); - void Eput( Matrix& Z, Int i, Int j ); + void Eput( Matrix& Z, Int i, Int j ); void Eput( const Matrix& Z, Int i, Int j ); void Get( Matrix& Z, Int i, Int j ); // synchronization routines void Flush( Matrix& Z ); - void Flush( const Matrix& Z ); - + void Flush( const Matrix& Z ); + private: - - static const Int - DATA_PUT_TAG =1, - DATA_GET_TAG =2, - DATA_ACC_TAG =3, - REQUEST_GET_TAG =4, - COORD_ACC_TAG =5, - COORD_PUT_TAG =6; + + static const Int + DATA_PUT_TAG =1, + DATA_GET_TAG =2, + DATA_ACC_TAG =3, + REQUEST_GET_TAG =4, + COORD_ACC_TAG =5, + COORD_PUT_TAG =6; // struct for passing data struct matrix_params_ { - const void *base_; - std::vector>> - data_; - std::vector> - requests_; - std::vector> - statuses_; + const void *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; }; - + std::vector matrices_; // struct for passing coordinates struct coord_params_ { - const void *base_; - std::vector>> - coord_; - std::vector> - requests_; - std::vector> - statuses_; + const void *base_; + std::vector>> + coord_; + std::vector> + requests_; + std::vector> + statuses_; }; - + std::vector coords_; // for blocking interface @@ -100,43 +100,43 @@ class AxpyInterface2 // intermediate buffer so that input // buffer could be reused std::vector>> - dataVectors_; + dataVectors_; - DistMatrix* GlobalArrayPut_; + DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; - bool toBeAttachedForPut_, toBeAttachedForGet_, - attached_, detached_; + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_; // next index for data and coord Int NextIndexData ( - Int target, - Int dataSize, - const void* base_address, - Int *mindex); - + Int target, + Int dataSize, + const void* base_address, + Int *mindex ); + Int NextIndexCoord ( - Int i, Int j, - Int target, - const void* base_address, - Int *cindex); + Int i, Int j, + Int target, + const void* base_address, + Int *cindex ); bool Testall(); bool Test( Matrix& Z ); - bool Test( const Matrix& Z ); + bool Test( const Matrix& Z ); bool TestAny( Matrix& Z ); - bool TestAny( const Matrix& Z ); + bool TestAny( const Matrix& Z ); void Waitall(); void Wait( Matrix& Z ); - void Wait( const Matrix& Z ); + void Wait( const Matrix& Z ); void WaitAny( Matrix& Z ); - void WaitAny( const Matrix& Z ); + void WaitAny( const Matrix& Z ); // these are only used for nonblocking // update rountines void HandleGlobalToLocalData( Matrix& Z ); - + void HandleLocalToGlobalData( Matrix& Z, Int source ); void HandleLocalToGlobalAcc( Matrix& Z, Int source ); diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp index d47a9233ac..9587081bbe 100644 --- a/include/El/core/RmaInterface.hpp +++ b/include/El/core/RmaInterface.hpp @@ -31,7 +31,7 @@ class RmaInterface void Attach( const DistMatrix& Z ); // Local completion - void Put( Matrix& Z, Int i, Int j ); + void Put( Matrix& Z, Int i, Int j ); void Put( const Matrix& Z, Int i, Int j ); void Get( Matrix& Z, Int i, Int j ); @@ -39,15 +39,15 @@ class RmaInterface void Acc( Matrix& Z, Int i, Int j ); void Acc( const Matrix& Z, Int i, Int j ); - // No local completion - void Iput( Matrix& Z, Int i, Int j ); + // No local completion + void Iput( Matrix& Z, Int i, Int j ); void Iput( const Matrix& Z, Int i, Int j ); void Iacc( Matrix& Z, Int i, Int j ); void Iacc( const Matrix& Z, Int i, Int j ); // Request based RMA - void Rput( Matrix& Z, Int i, Int j ); + void Rput( Matrix& Z, Int i, 
Int j ); void Rput( const Matrix& Z, Int i, Int j ); void Racc( Matrix& Z, Int i, Int j ); @@ -63,61 +63,61 @@ class RmaInterface void Detach(); private: - + mpi::Window window; // struct for passing data // for request based rma struct matrix_params_ { - const void *base_; - std::vector>> - data_; - std::vector> - requests_; - std::vector> - statuses_; + const void *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; }; - + std::vector matrices_; - // buffers for rma + // buffers for rma std::vector>> - getVector_, putVector_; + getVector_, putVector_; DistMatrix* GlobalArrayPut_; const DistMatrix* GlobalArrayGet_; - - bool toBeAttachedForPut_, toBeAttachedForGet_, - attached_, detached_; + + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_; // next index for data - Int NextIndex ( - Int dataSize, - std::deque > &dataVectors ); + Int NextIndex ( + Int dataSize, + std::deque >& dataVectors ); Int NextIndex ( - Int target, - Int dataSize, - const void* base_address, - Int* mindex); + Int target, + Int dataSize, + const void* base_address, + Int* mindex ); // only relevant for request-based // passive RMA bool anyPendingXfers ( Matrix& Z ); bool anyPendingXfers ( const Matrix& Z ); - + bool Testall(); bool Test( Matrix& Z ); - bool Test( const Matrix& Z ); + bool Test( const Matrix& Z ); bool TestAny( Matrix& Z ); - bool TestAny( const Matrix& Z ); + bool TestAny( const Matrix& Z ); void Waitall(); void Wait( Matrix& Z ); - void Wait( const Matrix& Z ); + void Wait( const Matrix& Z ); void WaitAny( Matrix& Z ); - void WaitAny( const Matrix& Z ); + void WaitAny( const Matrix& Z ); }; #endif // EL_ENABLE_RMA_AXPY } // namespace El diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index e9deaf4b0f..ea3a38880d 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -7,8 +7,8 @@ This interface is mainly due to Martin Schatz, but it was put into its current form by Jack Poulson. - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #include "El.hpp" @@ -16,259 +16,270 @@ namespace El { #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) -#else - template < typename T > bool AxpyInterface < T >::Finished () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Finished"); - if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - LogicError ("Not attached");) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); +#else +template bool AxpyInterface ::Finished () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Finished" ); + if ( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) + LogicError ( "Not attached" ); ) + const Grid& g = ( attachedForLocalToGlobal_ ? 
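+                      // Finished() exists only on the strict EOM-matching
+                      // path: it reports whether an end-of-matrix token has
+                      // been both sent to and received from every rank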
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid () ); const Int p = g.Size (); - bool finished = true; - for (Int rank = 0; rank < p; ++rank) - { - if (!sentEomTo_[rank] || !haveEomFrom_[rank]) - { - finished = false; - break; - } - } + + for ( Int rank = 0; rank < p; ++rank ) + { + if ( !sentEomTo_[rank] || !haveEomFrom_[rank] ) + { + finished = false; + break; + } + } + return finished; - } - - template < typename T > void AxpyInterface < T >::HandleEoms () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleEoms")) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); - const Int p = g.Size (); +} +template void AxpyInterface ::HandleEoms () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleEoms" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid () ); + const Int p = g.Size (); UpdateRequestStatuses (); // Try to progress our EOM sends - for (Int i = 0; i < p; ++i) - { - if (!sentEomTo_[i]) - { - bool shouldSendEom = true; - const Int numSends = sendingData_[i].size (); - for (Int j = 0; j < numSends; ++j) - { - if (sendingData_[i][j]) - { - shouldSendEom = false; - break; - } - } - const Int numRequests = sendingRequest_[i].size (); - for (Int j = 0; j < numRequests; ++j) - { - if (!shouldSendEom || sendingRequest_[i][j]) - { - shouldSendEom = false; - break; - } - } - const Int numReplies = sendingReply_[i].size (); - for (Int j = 0; j < numReplies; ++j) - { - if (!shouldSendEom || sendingReply_[i][j]) - { - shouldSendEom = false; - break; - } - } - if (shouldSendEom) - { - mpi::Request & request = eomSendRequests_[i]; - mpi::TaggedISSend - (&sendDummy_, 1, i, EOM_TAG, g.VCComm (), request); - sentEomTo_[i] = true; - } - } - } + for ( Int i = 0; i < p; ++i ) + { + if ( !sentEomTo_[i] ) + { + bool shouldSendEom = true; + const Int numSends = sendingData_[i].size (); + + for ( Int j = 0; j < numSends; ++j ) + { + if ( sendingData_[i][j] ) + { + shouldSendEom = false; + break; + } + } + + const Int numRequests = sendingRequest_[i].size (); + + for ( Int j = 0; j < numRequests; ++j ) + { + if ( !shouldSendEom || sendingRequest_[i][j] ) + { + shouldSendEom = false; + break; + } + } + + const Int numReplies = sendingReply_[i].size (); + + for ( Int j = 0; j < numReplies; ++j ) + { + if ( !shouldSendEom || sendingReply_[i][j] ) + { + shouldSendEom = false; + break; + } + } + + if ( shouldSendEom ) + { + mpi::Request& request = eomSendRequests_[i]; + mpi::TaggedISSend + ( &sendDummy_, 1, i, EOM_TAG, g.VCComm (), request ); + sentEomTo_[i] = true; + } + } + } + mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - mpi::TaggedRecv (&recvDummy_, 1, source, EOM_TAG, g.VCComm ()); - haveEomFrom_[source] = true; - } - } -#endif - template < typename T > void AxpyInterface < T >::HandleLocalToGlobalData () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleLocalToGlobalData")) - DistMatrix < T > &Y = *localToGlobalMat_; - const Grid & g = Y.Grid (); + if ( mpi::IProbe ( mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + mpi::TaggedRecv ( &recvDummy_, 1, source, EOM_TAG, g.VCComm () ); + haveEomFrom_[source] = true; + } +} +#endif // EL_USE_IBARRIER_FOR_AXPY + +template void AxpyInterface ::HandleLocalToGlobalData () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleLocalToGlobalData" ) ) + DistMatrix & Y = 
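+    // Y is the distributed matrix that incoming local AXPY
+    // contributions are accumulated into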
*localToGlobalMat_; + const Grid& g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int myRow = g.Row (); const Int myCol = g.Col (); mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status)) - { - // Message exists, so recv and pack - const Int count = mpi::GetCount < byte > (status); - DEBUG_ONLY (if (count < Int (4 * sizeof (Int) + sizeof (T))) - LogicError ("Count was too small");) - const Int source = status.MPI_SOURCE; - recvVector_.resize (count); - byte *recvBuffer = recvVector_.data (); - mpi::TaggedRecv (recvBuffer, count, source, DATA_TAG, g.VCComm ()); - // Extract the header - byte *head = recvBuffer; - const Int i = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int j = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T alpha = *reinterpret_cast < const T * >(head); - head += sizeof (T); - DEBUG_ONLY (if (height < 0 || width < 0) - RuntimeError - ("Unpacked heights were negative:\n", - " i= ", i, std::hex, "(", i, ")\n", std::dec, - " j= ", j, std::hex, "(", j, ")\n", std::dec, - " height=", height, std::hex, "(", height, ")\n", - std::dec, " width= ", width, std::hex, "(", width, - ")\n", std::dec, " alpha= ", alpha); - if (i < 0 - || j < - 0) RuntimeError ("Unpacked offsets were negative:\n", - " i= ", i, std::hex, "(", i, - ")\n", std::dec, " j= ", j, - std::hex, "(", j, ")\n", std::dec, - " height=", height, std::hex, "(", - height, ")\n", std::dec, " width= ", - width, std::hex, "(", width, ")\n", - std::dec, " alpha= ", alpha); - if (i + height > Y.Height () - || j + width > - Y.Width ())RuntimeError - ("Unpacked submatrix was out of bounds:\n", " i= ", - i, std::hex, "(", i, ")\n", std::dec, " j= ", j, - std::hex, "(", j, ")\n", std::dec, " height=", height, - std::hex, "(", height, ")\n", std::dec, " width= ", - width, std::hex, "(", width, ")\n", std::dec, - " alpha= ", alpha);) - - // Update Y - const T *XBuffer = reinterpret_cast < const T * >(head); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += alpha * XCol[s]; - } - // Free the memory for the recv buffer - recvVector_.clear (); - } - } - - template < typename T > - void AxpyInterface < T >::HandleGlobalToLocalRequest () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalRequest")) - const DistMatrix < T > &X = *globalToLocalMat_; - const Grid & g = X.Grid (); + if ( mpi::IProbe ( mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status ) ) + { + // Message exists, so recv and pack + const Int count = mpi::GetCount ( status ); + + DEBUG_ONLY ( if ( count < Int ( 4 * sizeof ( Int ) + sizeof ( T ) ) ) + LogicError ( "Count was too small" ); ) + ; + const Int source = status.MPI_SOURCE; + + recvVector_.resize ( count ); + byte* recvBuffer = recvVector_.data (); + 
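+        // packed message layout: i, j, height, width (four Ints),
+        // followed by alpha (one T) and the column-major payload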
mpi::TaggedRecv ( recvBuffer, count, source, DATA_TAG, g.VCComm () ); + // Extract the header + byte* head = recvBuffer; + const Int i = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const Int j = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const Int height = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const Int width = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const T alpha = *reinterpret_cast ( head ); + head += sizeof ( T ); + + DEBUG_ONLY ( if ( height < 0 || width < 0 ) + RuntimeError + ( "Unpacked heights were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + if ( i < 0 || j < 0 ) + RuntimeError + ( "Unpacked offsets were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + if ( i + height > Y.Height () || j + width > Y.Width () ) + RuntimeError + ( "Unpacked submatrix was out of bounds:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, ")\n", + std::dec, " alpha= ", alpha ); ) + + // Update Y + const T* XBuffer = reinterpret_cast ( head ); + const Int colAlign = ( Y.ColAlign () + i ) % r; + const Int rowAlign = ( Y.RowAlign () + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int iLocalOffset = Length ( i, Y.ColShift (), r ); + const Int jLocalOffset = Length ( j, Y.RowShift (), c ); + + for ( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t ); + const T* XCol = &XBuffer[t * localHeight]; + + for ( Int s = 0; s < localHeight; ++s ) + YCol[s] += alpha * XCol[s]; + } + + // Free the memory for the recv buffer + recvVector_.clear (); + } +} + +template +void AxpyInterface ::HandleGlobalToLocalRequest () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleGlobalToLocalRequest" ) ) + const DistMatrix & X = *globalToLocalMat_; + const Grid& g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int myRow = g.Row (); const Int myCol = g.Col (); - mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status)) - { - // Request exists, so recv - const Int source = status.MPI_SOURCE; - const Int recvSize = 4 * sizeof (Int); - recvVector_.resize (recvSize); - byte *recvBuffer = recvVector_.data (); - mpi::TaggedRecv - (recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm ()); + if ( mpi::IProbe ( mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status ) ) + { + // Request exists, so recv + const Int source = status.MPI_SOURCE; + const Int recvSize = 4 * sizeof ( Int ); + recvVector_.resize ( recvSize ); + + byte* recvBuffer = recvVector_.data (); + mpi::TaggedRecv + ( recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm () ); + // Extract the header - const byte *recvHead = recvBuffer; - const Int i = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int j = 
*reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int height = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - const Int width = *reinterpret_cast < const Int * >(recvHead); - recvHead += sizeof (Int); - - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int numEntries = localHeight * localWidth; - - const Int bufferSize = 2 * sizeof (Int) + numEntries * sizeof (T); - const Int index = ReadyForSend (bufferSize, replyVectors_[source], - replySendRequests_[source], - sendingReply_[source]); - // Pack the reply header - byte *sendBuffer = replyVectors_[source][index].data (); - byte *sendHead = sendBuffer; - *reinterpret_cast < Int * >(sendHead) = myRow; - sendHead += sizeof (Int); - *reinterpret_cast < Int * >(sendHead) = myCol; - sendHead += sizeof (Int); - - // Pack the payload - T *sendData = reinterpret_cast < T * >(sendHead); - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &sendData[t * localHeight]; - const T *XCol = X.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } - // Fire off non-blocking send - mpi::TaggedISSend - (sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), - replySendRequests_[source][index]); - } - } + const byte* recvHead = recvBuffer; + const Int i = *reinterpret_cast ( recvHead ); + recvHead += sizeof ( Int ); + const Int j = *reinterpret_cast ( recvHead ); + recvHead += sizeof ( Int ); + const Int height = *reinterpret_cast ( recvHead ); + recvHead += sizeof ( Int ); + const Int width = *reinterpret_cast ( recvHead ); + recvHead += sizeof ( Int ); + + const Int colAlign = ( X.ColAlign () + i ) % r; + const Int rowAlign = ( X.RowAlign () + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int iLocalOffset = Length ( i, X.ColShift (), r ); + const Int jLocalOffset = Length ( j, X.RowShift (), c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + const Int bufferSize = 2 * sizeof ( Int ) + numEntries * sizeof ( T ); + const Int index = ReadyForSend ( bufferSize, replyVectors_[source], + replySendRequests_[source], + sendingReply_[source] ); + // Pack the reply header + byte* sendBuffer = replyVectors_[source][index].data (); + byte* sendHead = sendBuffer; + *reinterpret_cast ( sendHead ) = myRow; + sendHead += sizeof ( Int ); + *reinterpret_cast ( sendHead ) = myCol; + sendHead += sizeof ( Int ); + // Pack the payload + T* sendData = reinterpret_cast ( sendHead ); + + for ( Int t = 0; t < localWidth; ++t ) + { + T* sendCol = &sendData[t * localHeight]; + const T* XCol = X.LockedBuffer ( iLocalOffset, jLocalOffset + t ); + MemCopy ( sendCol, XCol, localHeight ); + } + + // Fire off non-blocking send + mpi::TaggedISSend + ( sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + replySendRequests_[source][index] ); + } +} template AxpyInterface::AxpyInterface() -: attachedForLocalToGlobal_(false), attachedForGlobalToLocal_(false), - localToGlobalMat_(0), globalToLocalMat_(0), - 
sendDummy_(0), recvDummy_(0) + : attachedForLocalToGlobal_( false ), attachedForGlobalToLocal_( false ), + localToGlobalMat_( 0 ), globalToLocalMat_( 0 ), + sendDummy_( 0 ), recvDummy_( 0 ) { } template AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) -: sendDummy_(0), recvDummy_(0) + : sendDummy_( 0 ), recvDummy_( 0 ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyInterface" ) ) + if( type == LOCAL_TO_GLOBAL ) { attachedForLocalToGlobal_ = true; @@ -285,22 +296,20 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) } const Int p = Z.Grid().Size(); - #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); eomSendRequests_.resize( p ); #endif - sendingData_.resize( p ); sendingRequest_.resize( p ); sendingReply_.resize( p ); - + dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( p ); - + dataVectors_.resize( p ); requestVectors_.resize( p ); replyVectors_.resize( p ); @@ -308,467 +317,481 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) template AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) -: sendDummy_(0), recvDummy_(0) + : sendDummy_( 0 ), recvDummy_( 0 ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyInterface" ) ) + if( type == LOCAL_TO_GLOBAL ) + LogicError( "Cannot update a constant matrix" ); + else { - LogicError("Cannot update a constant matrix"); + attachedForLocalToGlobal_ = false; + attachedForGlobalToLocal_ = true; + localToGlobalMat_ = 0; + globalToLocalMat_ = &X; } - else - { - attachedForLocalToGlobal_ = false; - attachedForGlobalToLocal_ = true; - localToGlobalMat_ = 0; - globalToLocalMat_ = &X; - } const Int p = X.Grid ().Size (); - #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - eomSendRequests_.resize (p); + sentEomTo_.resize ( p, false ); + haveEomFrom_.resize ( p, false ); + eomSendRequests_.resize ( p ); #endif + sendingData_.resize ( p ); + sendingRequest_.resize ( p ); + sendingReply_.resize ( p ); + + dataSendRequests_.resize ( p ); + requestSendRequests_.resize ( p ); + replySendRequests_.resize ( p ); + + dataVectors_.resize ( p ); + requestVectors_.resize ( p ); + replyVectors_.resize ( p ); +} + +template AxpyInterface ::~AxpyInterface () +{ + if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + { + if ( std::uncaught_exception () ) + { + const Grid& g = ( attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid () : + globalToLocalMat_->Grid () ); + std::ostringstream os; + os << g.Rank () + << + "Uncaught exception detected during AxpyInterface destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str (); + DEBUG_ONLY ( DumpCallStack () ) + } + else + Detach (); + } +} + +template +void AxpyInterface ::Attach ( AxpyType type, DistMatrix & Z ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Attach" ) ) + + if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + LogicError ( "Must detach before reattaching." 
); - sendingData_.resize (p); - sendingRequest_.resize (p); - sendingReply_.resize (p); - - dataSendRequests_.resize (p); - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - } - - template < typename T > AxpyInterface < T >::~AxpyInterface () - { - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - { - if (std::uncaught_exception ()) - { - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); - std::ostringstream os; - os << g.Rank () - << - "Uncaught exception detected during AxpyInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str (); - DEBUG_ONLY (DumpCallStack ())} - else - { - Detach (); - } - } - } - - template < typename T > - void AxpyInterface < T >::Attach (AxpyType type, DistMatrix < T > &Z) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - LogicError ("Must detach before reattaching."); - - const Grid & g = Z.Grid (); - - if (type == LOCAL_TO_GLOBAL) - { - attachedForLocalToGlobal_ = true; - localToGlobalMat_ = &Z; - } + const Grid& g = Z.Grid (); + + if ( type == LOCAL_TO_GLOBAL ) + { + attachedForLocalToGlobal_ = true; + localToGlobalMat_ = &Z; + } else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &Z; - } - const Int p = Z.Grid ().Size (); + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &Z; + } + const Int p = Z.Grid ().Size (); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - eomSendRequests_.resize (p); -#endif - + sentEomTo_.resize ( p, false ); + haveEomFrom_.resize ( p, false ); + eomSendRequests_.resize ( p ); +#endif // request objects - sendingRequest_.resize (p); - sendingData_.resize (p); - sendingReply_.resize (p); - - // ready-to-send - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - dataSendRequests_.resize (p); - - dataVectors_.resize (p); - requestVectors_.resize (p); - replyVectors_.resize (p); - } - - template < typename T > - void AxpyInterface < T >::Attach (AxpyType type, - const DistMatrix < T > &X) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Attach")) - if (attachedForLocalToGlobal_ || attachedForGlobalToLocal_) - LogicError ("Must detach before reattaching."); - - if (type == LOCAL_TO_GLOBAL) - { - LogicError ("Cannot update a constant matrix"); - } + sendingRequest_.resize ( p ); + sendingData_.resize ( p ); + sendingReply_.resize ( p ); + // ready-to-send + requestSendRequests_.resize ( p ); + replySendRequests_.resize ( p ); + dataSendRequests_.resize ( p ); + // data + dataVectors_.resize ( p ); + requestVectors_.resize ( p ); + replyVectors_.resize ( p ); +} + +template +void AxpyInterface ::Attach ( AxpyType type, + const DistMatrix & X ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Attach" ) ) + + if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + LogicError ( "Must detach before reattaching." 
); + + if ( type == LOCAL_TO_GLOBAL ) + LogicError ( "Cannot update a constant matrix" ); else - { - attachedForGlobalToLocal_ = true; - globalToLocalMat_ = &X; - } - const Int p = X.Grid ().Size (); + { + attachedForGlobalToLocal_ = true; + globalToLocalMat_ = &X; + } + const Int p = X.Grid ().Size (); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) -#else - // eom - sentEomTo_.resize (p, false); - haveEomFrom_.resize (p, false); - eomSendRequests_.resize (p); +#else + // eom + sentEomTo_.resize ( p, false ); + haveEomFrom_.resize ( p, false ); + eomSendRequests_.resize ( p ); #endif + // ready-to-send + sendingRequest_.resize ( p ); + sendingData_.resize ( p ); + sendingReply_.resize ( p ); + // ready-to-send + requestSendRequests_.resize ( p ); + replySendRequests_.resize ( p ); + dataSendRequests_.resize ( p ); + // data + dataVectors_.resize ( p ); + replyVectors_.resize ( p ); + requestVectors_.resize ( p ); +} - // ready-to-send - sendingRequest_.resize (p); - sendingData_.resize (p); - sendingReply_.resize (p); - - // ready-to-send - requestSendRequests_.resize (p); - replySendRequests_.resize (p); - dataSendRequests_.resize (p); - - dataVectors_.resize (p); - replyVectors_.resize (p); - requestVectors_.resize (p); - } - - template < typename T > - void AxpyInterface < T >::Axpy (T alpha, Matrix < T > &Z, Int i, Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) - if (attachedForLocalToGlobal_) - AxpyLocalToGlobal (alpha, Z, i, j); - else if (attachedForGlobalToLocal_) - AxpyGlobalToLocal (alpha, Z, i, j); +template +void AxpyInterface ::Axpy ( T alpha, Matrix & Z, Int i, Int j ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Axpy" ) ) + + if ( attachedForLocalToGlobal_ ) + AxpyLocalToGlobal ( alpha, Z, i, j ); + else if ( attachedForGlobalToLocal_ ) + AxpyGlobalToLocal ( alpha, Z, i, j ); else - LogicError ("Cannot axpy before attaching."); - } - - template < typename T > - void AxpyInterface < T >::Axpy (T alpha, const Matrix < T > &Z, Int i, - Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Axpy")) - if (attachedForLocalToGlobal_) - AxpyLocalToGlobal (alpha, Z, i, j); - else if (attachedForGlobalToLocal_) - LogicError ("Cannot update a constant matrix."); + LogicError ( "Cannot axpy before attaching." ); +} + +template +void AxpyInterface ::Axpy ( T alpha, const Matrix & Z, Int i, + Int j ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Axpy" ) ) + + if ( attachedForLocalToGlobal_ ) + AxpyLocalToGlobal ( alpha, Z, i, j ); + else if ( attachedForGlobalToLocal_ ) + LogicError ( "Cannot update a constant matrix." ); else - LogicError ("Cannot axpy before attaching."); - } + LogicError ( "Cannot axpy before attaching." 
); +} // Update Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width - template < typename T > - void AxpyInterface < T >::AxpyLocalToGlobal - (T alpha, const Matrix < T > &X, Int i, Int j) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyLocalToGlobal")) - DistMatrix < T > &Y = *localToGlobalMat_; - if (i < 0 || j < 0) - LogicError ("Submatrix offsets must be non-negative"); - if (i + X.Height () > Y.Height () || j + X.Width () > Y.Width ()) - LogicError ("Submatrix out of bounds of global matrix"); - - const Grid & g = Y.Grid (); +template +void AxpyInterface ::AxpyLocalToGlobal +( T alpha, const Matrix & X, Int i, Int j ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::AxpyLocalToGlobal" ) ) + DistMatrix & Y = *localToGlobalMat_; + + if ( i < 0 || j < 0 ) + LogicError ( "Submatrix offsets must be non-negative" ); + + if ( i + X.Height () > Y.Height () || j + X.Width () > Y.Width () ) + LogicError ( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int p = g.Size (); const Int myProcessRow = g.Row (); const Int myProcessCol = g.Col (); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - + const Int colAlign = ( Y.ColAlign () + i ) % r; + const Int rowAlign = ( Y.RowAlign () + j ) % c; const Int height = X.Height (); const Int width = X.Width (); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - for (Int step = 0; step < p; ++step) - { - const Int colShift = Shift (receivingRow, colAlign, r); - const Int rowShift = Shift (receivingCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int numEntries = localHeight * localWidth; - - if (numEntries != 0) - { - const Int destination = receivingRow + r * receivingCol; - const Int bufferSize = - 4 * sizeof (Int) + (numEntries + 1) * sizeof (T); - const Int index = - ReadyForSend (bufferSize, dataVectors_[destination], - dataSendRequests_[destination], - sendingData_[destination]); - - DEBUG_ONLY (if - (Int (dataVectors_[destination][index].size ()) != - bufferSize) LogicError ("Error in ReadyForSend");) - // Pack the header - byte *sendBuffer = dataVectors_[destination][index].data (); - byte *head = sendBuffer; - *reinterpret_cast < Int * >(head) = i; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = j; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = height; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = width; - head += sizeof (Int); - *reinterpret_cast < T * >(head) = alpha; - head += sizeof (T); - - // Pack the payload - T *sendData = reinterpret_cast < T * >(head); - const T *XBuffer = X.LockedBuffer (); - const Int XLDim = X.LDim (); - for (Int t = 0; t < localWidth; ++t) - { - T *thisSendCol = &sendData[t * localHeight]; - const T *thisXCol = &XBuffer[(rowShift + t * c) * XLDim]; - for (Int s = 0; s < localHeight; ++s) - thisSendCol[s] = thisXCol[colShift + s * r]; - } - // Fire off the non-blocking send - mpi::TaggedISSend - (sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), - dataSendRequests_[destination][index]); - } - receivingRow = (receivingRow + 1) % r; - if (receivingRow == 0) - receivingCol = (receivingCol + 1) % c; - } - } -// Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix - template < typename T > - void AxpyInterface < T >::AxpyGlobalToLocal (T alpha, Matrix < T > &Y, - Int i, Int j) - { - 
DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::AxpyGlobalToLocal")) - const DistMatrix < T > &X = *globalToLocalMat_; + for ( Int step = 0; step < p; ++step ) + { + const Int colShift = Shift ( receivingRow, colAlign, r ); + const Int rowShift = Shift ( receivingCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + + if ( numEntries != 0 ) + { + const Int destination = receivingRow + r * receivingCol; + const Int bufferSize = + 4 * sizeof ( Int ) + ( numEntries + 1 ) * sizeof ( T ); + const Int index = + ReadyForSend ( bufferSize, dataVectors_[destination], + dataSendRequests_[destination], + sendingData_[destination] ); + + DEBUG_ONLY ( if + ( Int ( dataVectors_[destination][index].size () ) != + bufferSize ) LogicError ( "Error in ReadyForSend" ); ) + // Pack the header + byte* sendBuffer = dataVectors_[destination][index].data (); + + byte* head = sendBuffer; + *reinterpret_cast ( head ) = i; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = j; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = height; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = width; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = alpha; + head += sizeof ( T ); + // Pack the payload + T* sendData = reinterpret_cast ( head ); + const T* XBuffer = X.LockedBuffer (); + const Int XLDim = X.LDim (); + + for ( Int t = 0; t < localWidth; ++t ) + { + T* thisSendCol = &sendData[t * localHeight]; + const T* thisXCol = &XBuffer[( rowShift + t * c ) * XLDim]; + + for ( Int s = 0; s < localHeight; ++s ) + thisSendCol[s] = thisXCol[colShift + s * r]; + } + + // Fire off the non-blocking send + mpi::TaggedISSend + ( sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + dataSendRequests_[destination][index] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if ( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } +} +// Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix +template +void AxpyInterface ::AxpyGlobalToLocal ( T alpha, Matrix & Y, + Int i, Int j ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::AxpyGlobalToLocal" ) ) + const DistMatrix & X = *globalToLocalMat_; const Int height = Y.Height (); const Int width = Y.Width (); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid AxpyGlobalToLocal submatrix"); - const Grid & g = X.Grid (); + if ( i + height > X.Height () || j + width > X.Width () ) + LogicError ( "Invalid AxpyGlobalToLocal submatrix" ); + + const Grid& g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int p = g.Size (); // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) - { - const Int bufferSize = 4 * sizeof (Int); - const Int index = ReadyForSend (bufferSize, requestVectors_[rank], - requestSendRequests_[rank], - sendingRequest_[rank]); - // Copy the request header into the send buffer - byte *sendBuffer = requestVectors_[rank][index].data (); - byte *head = sendBuffer; - *reinterpret_cast < Int * >(head) = i; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = j; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = height; - head += sizeof (Int); - *reinterpret_cast < Int * >(head) = width; - head += sizeof (Int); - - // Begin the non-blocking send - mpi::TaggedISSend - (sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), - requestSendRequests_[rank][index]); - } + 
for ( Int rank = 0; rank < p; ++rank ) + { + const Int bufferSize = 4 * sizeof ( Int ); + const Int index = ReadyForSend ( bufferSize, requestVectors_[rank], + requestSendRequests_[rank], + sendingRequest_[rank] ); + // Copy the request header into the send buffer + byte* sendBuffer = requestVectors_[rank][index].data (); + byte* head = sendBuffer; + *reinterpret_cast ( head ) = i; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = j; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = height; + head += sizeof ( Int ); + *reinterpret_cast ( head ) = width; + head += sizeof ( Int ); + // Begin the non-blocking send + mpi::TaggedISSend + ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (), + requestSendRequests_[rank][index] ); + } + // Receive all of the replies Int numReplies = 0; - while (numReplies < p) - { - HandleGlobalToLocalRequest (); - mpi::Status status; - - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - - // Ensure that we have a recv buffer - const Int count = mpi::GetCount < byte > (status); - recvVector_.resize (count); - byte *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm ()); - - // Unpack the reply header - const byte *head = recvBuffer; - const Int row = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const Int col = *reinterpret_cast < const Int * >(head); - head += sizeof (Int); - const T *recvData = reinterpret_cast < const T * >(head); - - // Compute the local heights and offsets - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (row, colAlign, r); - const Int rowShift = Shift (col, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Y.Buffer (0, rowShift + t * c); - const T *XCol = &recvData[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] += alpha * XCol[s]; - } - ++numReplies; - } - } - } - - template < typename T > - Int AxpyInterface < T >::ReadyForSend - (Int sendSize, - std::deque < std::vector < byte >> &sendVectors, - std::deque < mpi::Request > &requests, - std::deque < bool > &requestStatuses) - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReadyForSend")) - const Int numCreated = sendVectors.size (); - DEBUG_ONLY (if (numCreated != Int (requests.size ()) || - numCreated != - Int (requestStatuses.size ()))LogicError - ("size mismatch");) - for (Int i = 0; i < numCreated; ++i) - { - // If this request is still running, test to see if it finished. 
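ReadyForSend, whose old and new forms appear around this point, implements a grow-on-demand pool: scan the per-destination slots, reclaim the first one whose Issend has completed, otherwise append a fresh buffer/request/status triple. A minimal standalone sketch of the same pattern, using the MPI C API and std containers rather than the patch's wrappers:

    #include <cstddef>
    #include <deque>
    #include <vector>
    #include <mpi.h>

    int AcquireSlot( std::deque<std::vector<char>>& bufs,
                     std::deque<MPI_Request>& reqs,
                     std::deque<bool>& busy, int bytes )
    {
        for( std::size_t k = 0; k < bufs.size(); ++k )
        {
            int done = 0;
            if( busy[k] )
                MPI_Test( &reqs[k], &done, MPI_STATUS_IGNORE );
            if( !busy[k] || done ) // slot free or its Issend finished
            {
                busy[k] = true;
                bufs[k].resize( bytes );
                return static_cast<int>( k );
            }
        }
        bufs.emplace_back( bytes );         // no free slot: grow the pool
        reqs.push_back( MPI_REQUEST_NULL );
        busy.push_back( true );
        return static_cast<int>( bufs.size() ) - 1;
    }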
- if (requestStatuses[i]) - { - const bool finished = mpi::Test (requests[i]); - requestStatuses[i] = !finished; - } - - if (!requestStatuses[i]) - { - requestStatuses[i] = true; - sendVectors[i].resize (sendSize); - return i; - } - } - sendVectors.resize (numCreated + 1); - sendVectors[numCreated].resize (sendSize); - requests.push_back (mpi::REQUEST_NULL); - requestStatuses.push_back (true); - + + while ( numReplies < p ) + { + HandleGlobalToLocalRequest (); + mpi::Status status; + + if ( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount ( status ); + recvVector_.resize ( count ); + byte* recvBuffer = recvVector_.data (); + // Receive the data + mpi::TaggedRecv + ( recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm () ); + // Unpack the reply header + const byte* head = recvBuffer; + const Int row = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const Int col = *reinterpret_cast ( head ); + head += sizeof ( Int ); + const T* recvData = reinterpret_cast ( head ); + // Compute the local heights and offsets + const Int colAlign = ( X.ColAlign () + i ) % r; + const Int rowAlign = ( X.RowAlign () + j ) % c; + const Int colShift = Shift ( row, colAlign, r ); + const Int rowShift = Shift ( col, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + + // Unpack the local matrix + for ( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Y.Buffer ( 0, rowShift + t * c ); + const T* XCol = &recvData[t * localHeight]; + + for ( Int s = 0; s < localHeight; ++s ) + YCol[colShift + s * r] += alpha * XCol[s]; + } + + ++numReplies; + } + } +} + +template +Int AxpyInterface ::ReadyForSend +( Int sendSize, + std::deque >& sendVectors, + std::deque & requests, + std::deque & requestStatuses ) +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::ReadyForSend" ) ) + const Int numCreated = sendVectors.size (); + + DEBUG_ONLY ( if ( numCreated != Int ( requests.size () ) || + numCreated != Int ( requestStatuses.size () ) ) + LogicError( "size mismatch" ); ) + + for ( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, test to see if it finished. + if ( requestStatuses[i] ) + { + const bool finished = mpi::Test ( requests[i] ); + requestStatuses[i] = !finished; + } + + if ( !requestStatuses[i] ) + { + requestStatuses[i] = true; + sendVectors[i].resize ( sendSize ); + return i; + } + } + + sendVectors.resize ( numCreated + 1 ); + sendVectors[numCreated].resize ( sendSize ); + requests.push_back ( mpi::REQUEST_NULL ); + requestStatuses.push_back ( true ); return numCreated; - } - +} + #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) -template < typename T > bool AxpyInterface < T >::ReturnRequestStatuses () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::ReturnRequestStatuses")) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); +template bool AxpyInterface ::ReturnRequestStatuses () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::ReturnRequestStatuses" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid () ); const Int p = g.Size (); - for (Int i = 0; i < p; ++i) + for ( Int i = 0; i < p; ++i ) { - const Int numDataSendRequests = dataSendRequests_[i].size (); - for (Int j = 0; j < numDataSendRequests; ++j) - { - if (sendingData_[i][j]) - sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); - if (sendingData_[i][j]) - return false; - } - const Int numRequestSendRequests = requestSendRequests_[i].size (); - for (Int j = 0; j < numRequestSendRequests; ++j) - { - if (sendingRequest_[i][j]) - sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); - if (sendingRequest_[i][j]) - return false; - } - const Int numReplySendRequests = replySendRequests_[i].size (); - for (Int j = 0; j < numReplySendRequests; ++j) - { - if (sendingReply_[i][j]) - sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); - if (sendingReply_[i][j]) - return false; - } + const Int numDataSendRequests = dataSendRequests_[i].size (); + + for ( Int j = 0; j < numDataSendRequests; ++j ) + { + if ( sendingData_[i][j] ) + sendingData_[i][j] = !mpi::Test ( dataSendRequests_[i][j] ); + + if ( sendingData_[i][j] ) + return false; + } + + const Int numRequestSendRequests = requestSendRequests_[i].size (); + + for ( Int j = 0; j < numRequestSendRequests; ++j ) + { + if ( sendingRequest_[i][j] ) + sendingRequest_[i][j] = !mpi::Test ( requestSendRequests_[i][j] ); + + if ( sendingRequest_[i][j] ) + return false; + } + + const Int numReplySendRequests = replySendRequests_[i].size (); + + for ( Int j = 0; j < numReplySendRequests; ++j ) + { + if ( sendingReply_[i][j] ) + sendingReply_[i][j] = !mpi::Test ( replySendRequests_[i][j] ); + + if ( sendingReply_[i][j] ) + return false; + } } + return true; - } -#else - template < typename T > void AxpyInterface < T >::UpdateRequestStatuses () - { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::UpdateRequestStatuses")) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid ()); +} +#else +template void AxpyInterface ::UpdateRequestStatuses () +{ + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::UpdateRequestStatuses" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? 
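The split above is deliberate: ReturnRequestStatuses must answer whether every outstanding Issend has completed locally, hence its early `return false`, because the nonblocking consensus may only enter the Ibarrier at that point; UpdateRequestStatuses merely polls so the EOM path can make progress. A compressed sketch of the former's contract (illustrative; MPI C API):

    #include <deque>
    #include <mpi.h>

    // True only when every local Issend has been matched by a receive.
    // MPI_Test on MPI_REQUEST_NULL reports completion, so reclaimed
    // slots pass through harmlessly.
    bool AllLocalSendsDone( std::deque<MPI_Request>& reqs )
    {
        for( MPI_Request& r : reqs )
        {
            int done = 0;
            MPI_Test( &r, &done, MPI_STATUS_IGNORE );
            if( !done )
                return false;
        }
        return true;
    }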
+ localToGlobalMat_->Grid () : + globalToLocalMat_->Grid () ); const Int p = g.Size (); - for (Int i = 0; i < p; ++i) - { - const Int numDataSendRequests = dataSendRequests_[i].size (); - for (Int j = 0; j < numDataSendRequests; ++j) - if (sendingData_[i][j]) - sendingData_[i][j] = !mpi::Test (dataSendRequests_[i][j]); - const Int numRequestSendRequests = requestSendRequests_[i].size (); - for (Int j = 0; j < numRequestSendRequests; ++j) - if (sendingRequest_[i][j]) - sendingRequest_[i][j] = !mpi::Test (requestSendRequests_[i][j]); - const Int numReplySendRequests = replySendRequests_[i].size (); - for (Int j = 0; j < numReplySendRequests; ++j) - if (sendingReply_[i][j]) - sendingReply_[i][j] = !mpi::Test (replySendRequests_[i][j]); - } - } -#endif + for ( Int i = 0; i < p; ++i ) + { + const Int numDataSendRequests = dataSendRequests_[i].size (); + + for ( Int j = 0; j < numDataSendRequests; ++j ) + if ( sendingData_[i][j] ) + sendingData_[i][j] = !mpi::Test ( dataSendRequests_[i][j] ); + + const Int numRequestSendRequests = requestSendRequests_[i].size (); + + for ( Int j = 0; j < numRequestSendRequests; ++j ) + if ( sendingRequest_[i][j] ) + sendingRequest_[i][j] = !mpi::Test ( requestSendRequests_[i][j] ); + + const Int numReplySendRequests = replySendRequests_[i].size (); + + for ( Int j = 0; j < numReplySendRequests; ++j ) + if ( sendingReply_[i][j] ) + sendingReply_[i][j] = !mpi::Test ( replySendRequests_[i][j] ); + } +} +#endif //EL_USE_IBARRIER_FOR_AXPY -template < typename T > void AxpyInterface < T >::Detach () +template void AxpyInterface ::Detach () { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::Detach")) - if (!attachedForLocalToGlobal_ && !attachedForGlobalToLocal_) - LogicError ("Must attach before detaching."); + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Detach" ) ) - const Grid & g = (attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : globalToLocalMat_-> - Grid ()); + if ( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) + LogicError ( "Must attach before detaching." ); + const Grid& g = ( attachedForLocalToGlobal_ ? 
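Detach below is where the commit's headline change lands: Elemental's strict EOM matching is replaced by the nonblocking consensus (NBX) of Algorithm 2 in the Hoefler et al. DSDE paper. A minimal standalone sketch of that termination pattern; this uses the MPI-3 C API rather than the patch's mpi:: wrappers, and the callback names are placeholders:

    #include <mpi.h>

    template <class AllSendsDone, class ServiceIncoming>
    void NbxDrain( MPI_Comm comm, AllSendsDone allSendsDone,
                   ServiceIncoming serviceIncoming )
    {
        MPI_Request bar;
        int barrierActive = 0, done = 0;
        while( !done )
        {
            serviceIncoming();          // probe and absorb any message
            if( barrierActive )
                MPI_Test( &bar, &done, MPI_STATUS_IGNORE );
            else if( allSendsDone() )   // all local Issends matched
            {
                MPI_Ibarrier( comm, &bar );
                barrierActive = 1;
            }
        }
    }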
+ localToGlobalMat_->Grid () : globalToLocalMat_-> + Grid () ); // nonblocking consensus #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) bool DONE = false; @@ -779,50 +802,46 @@ template < typename T > void AxpyInterface < T >::Detach () #else while( !Finished() ) #endif - { - if( attachedForLocalToGlobal_ ) - HandleLocalToGlobalData(); - else - HandleGlobalToLocalRequest(); + { + if( attachedForLocalToGlobal_ ) + HandleLocalToGlobalData(); + else + HandleGlobalToLocalRequest(); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) - if (nb_bar_active) - DONE = mpi::Test (nb_bar_request); - else - { - if ( ReturnRequestStatuses() ) - { - // all ssends are complete, start nonblocking barrier - mpi::IBarrier (g.VCComm (), nb_bar_request); - nb_bar_active = true; - } - } + if ( nb_bar_active ) + DONE = mpi::Test ( nb_bar_request ); + else + { + if ( ReturnRequestStatuses() ) + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier ( g.VCComm (), nb_bar_request ); + nb_bar_active = true; + } + } + #else - HandleEoms(); + HandleEoms(); #endif - } - - mpi::Barrier (g.VCComm ()); + } + mpi::Barrier ( g.VCComm () ); attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; recvVector_.clear(); - #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else sentEomTo_.clear(); haveEomFrom_.clear(); eomSendRequests_.clear(); #endif - sendingData_.clear(); sendingRequest_.clear(); sendingReply_.clear(); - dataVectors_.clear(); requestVectors_.clear(); replyVectors_.clear(); - dataSendRequests_.clear(); requestSendRequests_.clear(); replySendRequests_.clear(); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index 6bf32ea4cc..ba27ec920d 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -8,25 +8,22 @@ which can be found in the LICENSE file in the root directory, or at namespace El { - template - AxpyInterface2::AxpyInterface2() - : GlobalArrayPut_(0), GlobalArrayGet_(0), - matrices_(0), coords_(0), dataVectors_(0), - toBeAttachedForGet_(false), toBeAttachedForPut_(false), - attached_(false), detached_(true) - { } +template +AxpyInterface2::AxpyInterface2() + : GlobalArrayPut_( 0 ), GlobalArrayGet_( 0 ), + matrices_( 0 ), coords_( 0 ), dataVectors_( 0 ), + toBeAttachedForGet_( false ), toBeAttachedForPut_( false ), + attached_( false ), detached_( true ) +{ } template AxpyInterface2::AxpyInterface2( DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::AxpyInterface2")) - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::AxpyInterface2" ) ) attached_ = false; detached_ = true; - toBeAttachedForGet_ = false; toBeAttachedForPut_ = false; - GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; } @@ -34,14 +31,11 @@ AxpyInterface2::AxpyInterface2( DistMatrix& Z ) template AxpyInterface2::AxpyInterface2( const DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::AxpyInterface2")) - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::AxpyInterface2" ) ) attached_ = false; detached_ = true; - toBeAttachedForGet_ = false; toBeAttachedForPut_ = false; - GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; } @@ -49,182 +43,181 @@ AxpyInterface2::AxpyInterface2( const DistMatrix& Z ) template AxpyInterface2::~AxpyInterface2() { - if( std::uncaught_exception() ) - { - std::ostringstream os; - os << "Uncaught exception detected during AxpyInterface2 destructor " - "that required a call to Detach. 
Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; - std::cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) - } - else - { - Detach(); - } + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during AxpyInterface2 destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY( DumpCallStack() ) + } + else + Detach(); } template Int AxpyInterface2::NextIndexData ( - Int target, - Int dataSize, - const void* base_address, - Int *mindex) + Int target, + Int dataSize, + const void* base_address, + Int* mindex ) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexData")) - + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface2::NextIndexData" ) ) assert ( base_address != NULL ); - Int matrixIndex = 0; const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numMatrices = matrices_.size(); // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - // uninitiated, first time - if ( matrices_[m].base_ == NULL ) - { - matrices_[m].base_ = base_address; - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + // uninitiated, first time + if ( matrices_[m].base_ == NULL ) + { + matrices_[m].base_ = base_address; + matrixIndex = m; + break; + } + + matrixIndex = m+1; } - + // need to create new object - if ( matrixIndex == numMatrices) + if ( matrixIndex == numMatrices ) { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - matrices_[matrixIndex].base_ = base_address; + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; } - // go through the request, data, + + // go through the request, data, // status objects const Int numCreated = matrices_[matrixIndex].data_[target].size (); - DEBUG_ONLY (if (numCreated != Int (matrices_[matrixIndex].requests_[target].size ()) || - numCreated != Int (matrices_[matrixIndex].statuses_[target].size ())) - LogicError ("size mismatch");) - - for (Int i = 0; i < numCreated; ++i) - { - // If this request is still running, - // test to see if it finished. 
- if (matrices_[matrixIndex].statuses_[target][i]) - { - const bool finished = mpi::Test (matrices_[matrixIndex].requests_[target][i]); - matrices_[matrixIndex].statuses_[target][i] = !finished; - } - - if (!matrices_[matrixIndex].statuses_[target][i]) - { - matrices_[matrixIndex].statuses_[target][i] = true; - matrices_[matrixIndex].data_[target][i].resize ( dataSize ); - *mindex = matrixIndex; - return i; - } - } + + DEBUG_ONLY ( if ( numCreated != Int ( matrices_[matrixIndex].requests_[target].size () ) + || numCreated != Int ( matrices_[matrixIndex].statuses_[target].size () ) ) + LogicError ( "size mismatch" ); ) + for ( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. + if ( matrices_[matrixIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test ( matrices_[matrixIndex].requests_[target][i] ); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } + + if ( !matrices_[matrixIndex].statuses_[target][i] ) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize ( dataSize ); + *mindex = matrixIndex; + return i; + } + } matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); matrices_[matrixIndex].statuses_[target].push_back ( true ); *mindex = matrixIndex; - return numCreated; } template Int AxpyInterface2::NextIndexCoord ( - Int i, Int j, - Int target, - const void* base_address, - Int *cindex) + Int i, Int j, + Int target, + const void* base_address, + Int* cindex ) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface2::NextIndexCoord")) - + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface2::NextIndexCoord" ) ) assert ( base_address != NULL ); - Int coordIndex = 0; const Grid& g = ( toBeAttachedForPut_ ? 
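NextIndexData above (and NextIndexCoord below) key their buffer pools on the local matrix's base address, creating one matrix_params_/coord_params_ entry per distinct matrix on first use. The linear scan is equivalent to a map lookup; a compressed sketch with hypothetical names:

    #include <cstddef>
    #include <map>

    std::map<const void*, std::size_t> poolIndex;

    std::size_t LookupPool( const void* base )
    {
        auto it = poolIndex.find( base );
        if( it != poolIndex.end() )
            return it->second;           // seen this matrix before
        const std::size_t fresh = poolIndex.size();
        poolIndex[base] = fresh;         // first use: new pool entry
        return fresh;
    }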
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numCoords = coords_.size(); // search for matrix base - for (Int m = 0; m < numCoords; m++) + for ( Int m = 0; m < numCoords; m++ ) { - if ( coords_[m].base_ == base_address ) - { - coordIndex = m; - break; - } - if ( coords_[m].base_ == NULL ) - { - coords_[m].base_ = base_address; - coordIndex = m; - break; - } - coordIndex = m+1; + if ( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + + if ( coords_[m].base_ == NULL ) + { + coords_[m].base_ = base_address; + coordIndex = m; + break; + } + + coordIndex = m+1; } - + // need to create new object if ( coordIndex == numCoords ) { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); - coords_[coordIndex].base_ = base_address; + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new coord_params created + // with default constructor + coords_.push_back( cp ); + coords_[coordIndex].base_ = base_address; } - // go through the request, data, + + // go through the request, data, // status objects const Int numCreated = coords_[coordIndex].coord_[target].size (); - DEBUG_ONLY (if (numCreated != Int (coords_[coordIndex].requests_[target].size ()) || - numCreated != Int (matrices_[coordIndex].statuses_[target].size ())) - LogicError ("size mismatch");) - - for (Int i = 0; i < numCreated; ++i) - { - // If this request is still running, - // test to see if it finished. - if (coords_[coordIndex].statuses_[target][i]) - { - const bool finished = mpi::Test (coords_[coordIndex].requests_[target][i]); - coords_[coordIndex].statuses_[target][i] = !finished; - } - - if (!coords_[coordIndex].statuses_[target][i]) - { - coords_[coordIndex].statuses_[target][i] = true; - coords_[coordIndex].coord_[target][i][0] = i; - coords_[coordIndex].coord_[target][i][1] = j; - *cindex = coordIndex; - return i; - } - } + + DEBUG_ONLY ( if ( numCreated != Int ( coords_[coordIndex].requests_[target].size () ) + || numCreated != Int ( coords_[coordIndex].statuses_[target].size () ) ) + LogicError ( "size mismatch" ); ) + // idx avoids shadowing the submatrix offset parameter i + for ( Int idx = 0; idx < numCreated; ++idx ) + { + // If this request is still running, + // test to see if it finished.
+ if ( coords_[coordIndex].statuses_[target][idx] ) + { + const bool finished = mpi::Test ( coords_[coordIndex].requests_[target][idx] ); + coords_[coordIndex].statuses_[target][idx] = !finished; + } + + if ( !coords_[coordIndex].statuses_[target][idx] ) + { + coords_[coordIndex].statuses_[target][idx] = true; + coords_[coordIndex].coord_[target][idx][0] = i; + coords_[coordIndex].coord_[target][idx][1] = j; + *cindex = coordIndex; + return idx; + } + } coords_[coordIndex].coord_[target].resize ( numCreated + 1 ); coords_[coordIndex].coord_[target][numCreated][0] = i; @@ -232,337 +225,311 @@ Int AxpyInterface2::NextIndexCoord ( coords_[coordIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); coords_[coordIndex].statuses_[target].push_back ( true ); *cindex = coordIndex; - return numCreated; } template void AxpyInterface2::Attach( DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Attach" ) ) + // attached_ will be only set in Attach // and only unset in Detach - if (!attached_ && detached_) + if ( !attached_ && detached_ ) { - attached_ = true; - detached_ = false; + attached_ = true; + detached_ = false; } else - LogicError("Must detach before reattaching."); + LogicError( "Must detach before reattaching." ); const Grid& g = Z.Grid(); const Int p = g.Size (); - - // the matrix base_ is not known until + + // the matrix base_ is not known until // an update operation (put/get/acc) // so it is kept blank // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - - if ( dataVectors_.empty() ) - dataVectors_.resize(p); - - if ( matrices_.empty() ) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if ( coords_.empty() ) - { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); - } + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + if ( dataVectors_.empty() ) + dataVectors_.resize( p ); + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new coord_params created + // with default constructor + coords_.push_back( cp ); + } } - mpi::Barrier (g.VCComm()); + mpi::Barrier ( g.VCComm() ); } template void AxpyInterface2::Attach( const DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Attach")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Attach" ) ) + // attached_ will be only set in Attach // and only unset in Detach - if (!attached_ && detached_) + if ( !attached_ && detached_ ) { - attached_ = true; - detached_ = false; + attached_ = true; + detached_ = false; } else - LogicError("Must detach before reattaching."); + LogicError( "Must 
detach before reattaching." ); const Grid& g = Z.Grid(); const Int p = g.Size (); - - // the matrix base_ is not known until + + // the matrix base_ is not known until // an update operation (put/get/acc) // so it is kept blank // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible if( !toBeAttachedForGet_ ) { - GlobalArrayPut_ = 0; - toBeAttachedForPut_ = false; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - - if ( matrices_.empty() ) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if ( coords_.empty() ) - { - struct coord_params_ cp; - cp.coord_.resize(p); - cp.requests_.resize(p); - cp.statuses_.resize(p); - cp.base_ = NULL; - // push back new matrix_params created - // with default constructor - coords_.push_back( cp ); - } + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if ( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } } - mpi::Barrier (g.VCComm()); + mpi::Barrier ( g.VCComm() ); } // end-to-end blocking put/acc routines template void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eput")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Eput" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); - const Int XLDim = Z.LDim(); - const Int height = Z.Height(); const Int width = Z.Width(); - const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); - + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int YLDim = Y.LDim(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int YLDim = Y.LDim(); Int matrix_index, coord_index; - + // data/coord send for( Int step=0; step 0 ) - { - const Int destination = receivingRow + r*receivingCol; - // data - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T 
*sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - // post receive for coordinates - mpi::TaggedISend (coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + + if( numEntries > 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data + const Int dindex = + NextIndexData ( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY ( if + ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != + numEntries ) LogicError ( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + + for( Int t=0; t( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend ( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; } // poke - Test (Z); - + Test ( Z ); // data/coord receive std::vector recvVector_; - for (Int step=0; step void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) { - Eput( const_cast&>(Z), i, j ); + Eput( const_cast&>( Z ), i, j ); } // end to end blocking routines template void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Eacc")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Eacc" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); - const Int XLDim = Z.LDim(); - const Int height = Z.Height(); const Int width = Z.Width(); - const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); - const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); - + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int YLDim = Y.LDim(); - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int YLDim = Y.LDim(); // data/coord receive std::vector recvVector_; Int matrix_index, coord_index; @@ -570,232 +537,223 @@ void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) // data/coord send for( Int step=0; step 0 ) - { - const 
Int destination = receivingRow + r*receivingCol; - // data - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - // post receive for coordinates - mpi::TaggedISend (coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; - if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int numEntries = localHeight * localWidth; + + if( numEntries > 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data + const Int dindex = + NextIndexData ( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY ( if + ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != + numEntries ) LogicError ( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + + for( Int t=0; t( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend ( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; } // test for requests - Test (Z); + Test ( Z ); - for (Int step=0; step void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) { - Eacc( const_cast&>(Z), i, j ); + Eacc( const_cast&>( Z ), i, j ); } template void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Get")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Get" ) ) + // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - const DistMatrix& X = *GlobalArrayGet_; + LogicError ( "Cannot perform this operation as matrix is not attached." 
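Eput and Eacc above move each update as a pair of messages per owning rank: a three-Int coordinate message (i, j, numEntries) on COORD_PUT_TAG or COORD_ACC_TAG, then the packed entries on the matching data tag. A sketch of one receiver step for the put case; the tag parameters and the double element type are placeholders, numEntries is assumed positive, and the local scatter is elided:

    #include <vector>
    #include <mpi.h>

    void ServiceOnePut( MPI_Comm comm, int coordTag, int dataTag )
    {
        MPI_Status st;
        int flag = 0;
        MPI_Iprobe( MPI_ANY_SOURCE, coordTag, comm, &flag, &st );
        if( !flag )
            return;
        int coord[3]; // i, j, numEntries
        MPI_Recv( coord, 3, MPI_INT, st.MPI_SOURCE, coordTag,
                  comm, MPI_STATUS_IGNORE );
        std::vector<double> data( coord[2] );
        MPI_Recv( data.data(), coord[2], MPI_DOUBLE, st.MPI_SOURCE,
                  dataTag, comm, MPI_STATUS_IGNORE );
        // ...assign (put) or axpy (acc) into the local strip at (i, j)...
    }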
); + const DistMatrix& X = *GlobalArrayGet_; const Int height = Z.Height (); const Int width = Z.Width (); - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid submatrix for Iget"); - T* XBuffer = Z.Buffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + if ( i + height > X.Height () || j + width > X.Width () ) + LogicError ( "Invalid submatrix for Iget" ); - const Grid & g = X.Grid (); + T* XBuffer = Z.Buffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + const Grid& g = X.Grid (); const Int p = g.Size (); const Int r = g.Height (); const Int c = g.Width (); - Int coord_index; std::vector recvVector_; // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) + for ( Int rank = 0; rank < p; ++rank ) { - const Int cindex = - NextIndexCoord (i, j, - rank, - Buffer, - &coord_index); - - Int *coord = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data ()); - coord[0] = i; - coord[1] = j; + const Int cindex = + NextIndexCoord ( i, j, + rank, + Buffer, + &coord_index ); + Int* coord = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data () ); + coord[0] = i; + coord[1] = j; coord[2] = -1; - - mpi::TaggedISend (coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - coords_[coord_index].requests_[rank][cindex]); + mpi::TaggedISend ( coord, 3, rank, + REQUEST_GET_TAG, g.VCComm (), + coords_[coord_index].requests_[rank][cindex] ); } - + // Receive all of the replies Int numReplies = 0; - while (numReplies < p) + + while ( numReplies < p ) { - mpi::Status status; - HandleGlobalToLocalData ( Z ); - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); - - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = (X.ColAlign () + i) % r; - const Int rowAlign = (X.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - //T *YCol = X.Buffer (0, rowShift + t * c); - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - ++numReplies; - recvVector_.clear(); - } + mpi::Status status; + HandleGlobalToLocalData ( Z ); + + if ( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount ( status ); + recvVector_.resize ( count ); + T* recvBuffer = recvVector_.data (); + // Receive the data + mpi::TaggedRecv + ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm () ); + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = ( X.ColAlign () + i ) % r; + const Int rowAlign = ( X.RowAlign () + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, 
colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + + // Unpack the local matrix + for ( Int t = 0; t < localWidth; ++t ) + { + //T *YCol = X.Buffer (0, rowShift + t * c); + T* YCol = Z.Buffer ( 0, rowShift + t * c ); + const T* XCol = &recvBuffer[t * localHeight]; + + for ( Int s = 0; s < localHeight; ++s ) + YCol[colShift + s * r] = XCol[s]; + } + + ++numReplies; + recvVector_.clear(); + } } } -// nonblocking, no local completion +// nonblocking, no local completion template void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iput")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Iput" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -803,23 +761,19 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); Int matrix_index, coord_index; // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast< void * >(const_cast< T * >(Z.LockedBuffer())); - + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + for( Int step=0; step::Iput( const Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - // post receive for coordinates - mpi::TaggedISend (coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; + + // put request + mpi::TaggedISend ( sendBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm (), + matrices_[matrix_index].requests_[destination][dindex] ); + // send coordinates + const Int cindex = + NextIndexCoord ( i, j, + destination, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend ( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), 
+ coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingCol = ( receivingCol + 1 ) % c; } } template void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) { - Iput( const_cast&>(Z), i, j ); + Iput( const_cast&>( Z ), i, j ); } template void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iget")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Iget" ) ) + // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); - const DistMatrix& X = *GlobalArrayGet_; + LogicError ( "Cannot perform this operation as matrix is not attached." ); + const DistMatrix& X = *GlobalArrayGet_; const Int height = Z.Height (); const Int width = Z.Width (); - - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); - + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); Int coord_index; - - if (i + height > X.Height () || j + width > X.Width ()) - LogicError ("Invalid submatrix for Iget"); - const Grid & g = X.Grid (); + if ( i + height > X.Height () || j + width > X.Width () ) + LogicError ( "Invalid submatrix for Iget" ); + + const Grid& g = X.Grid (); const Int p = g.Size (); // Send out the requests to all processes in the grid - for (Int rank = 0; rank < p; ++rank) + for ( Int rank = 0; rank < p; ++rank ) { - // send coordinates - const Int cindex = - NextIndexCoord (i, j, - rank, - Buffer, - &coord_index); - - Int *coord_ = reinterpret_cast(coords_[coord_index].coord_[rank][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = -1; - - // post receive for coordinates - mpi::TaggedISend (coord_, 3, rank, - REQUEST_GET_TAG, g.VCComm(), - coords_[coord_index].requests_[rank][cindex]); + // send coordinates + const Int cindex = + NextIndexCoord ( i, j, + rank, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = -1; + // post receive for coordinates + mpi::TaggedISend ( coord_, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex] ); } } @@ -939,19 +887,20 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) template void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Iacc")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Iacc" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; Int matrix_index, coord_index; - + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -959,21 +908,17 @@ void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i 
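
Iget, completed just above, initiates a get by sending the same 3-integer header to every rank in the grid, with the count field set to -1 to mark it as a request rather than a payload descriptor. A sketch of that broadcast of requests (tag value illustrative):

    #include <mpi.h>
    #include <array>
    #include <vector>

    enum { REQUEST_GET_TAG = 7 }; // illustrative value

    // Ask every rank for its share of the submatrix at (i, j); the
    // headers must stay alive until the requests complete.
    void request_get( int i, int j, int p, MPI_Comm comm,
                      std::vector< std::array<int,3> >& coords,
                      std::vector<MPI_Request>& reqs )
    {
        const std::array<int,3> header = { { i, j, -1 } }; // -1: get request
        coords.assign( p, header );
        reqs.resize( p );
        for( int rank = 0; rank < p; ++rank )
            MPI_Isend( coords[rank].data(), 3, MPI_INT, rank,
                       REQUEST_GET_TAG, comm, &reqs[rank] );
    }
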
) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - - const Int YLDim = Y.LDim(); - + const Int YLDim = Y.LDim(); const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); for( Int step=0; step::Iacc( const Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - mpi::TaggedISend (coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; + + // acc request + mpi::TaggedISend ( sendBuffer, numEntries, destination, + DATA_ACC_TAG, g.VCComm (), + matrices_[matrix_index].requests_[destination][dindex] ); + // send coordinates + const Int cindex = + NextIndexCoord ( i, j, + destination, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend ( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingCol = ( receivingCol + 1 ) % c; } } template void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) { - Iacc( const_cast&>(Z), i, j ); + Iacc( const_cast&>( Z ), i, j ); } -// nonblocking, local completion +// nonblocking, local completion template void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Put")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Put" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; + //do boundary checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -1062,33 +1008,26 @@ void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); Int matrix_index, coord_index; // local matrix width and height const Int height = 
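
The alignment and shift arithmetic being recomputed here appears in every transfer routine: global entry (i+s, j+t) lives on the process at grid position ((colAlign+s) mod r, (rowAlign+t) mod c), and Shift/Length count which of those entries each process owns. A self-contained sketch, assuming the usual semantics Shift(rank, align, stride) = (rank + stride - align) mod stride and Length(n, shift, stride) = number of multiples of stride starting at shift that stay below n:

    #include <cstdio>

    // Assumed semantics of the Shift/Length helpers used throughout.
    static int Shift( int rank, int align, int stride )
    { return ( rank + stride - align ) % stride; }

    static int Length( int n, int shift, int stride )
    { return ( n > shift ? ( n - shift + stride - 1 ) / stride : 0 ); }

    int main()
    {
        const int r = 2, c = 3;          // process grid height and width
        const int height = 5, width = 7; // submatrix being transferred
        const int colAlign = 0, rowAlign = 1;

        for( int row = 0; row < r; ++row )
            for( int col = 0; col < c; ++col )
            {
                const int colShift = Shift( row, colAlign, r );
                const int rowShift = Shift( col, rowAlign, c );
                const int numEntries = Length( height, colShift, r ) *
                                       Length( width,  rowShift, c );
                std::printf( "grid (%d,%d) owns %d entries\n",
                             row, col, numEntries );
            }

        return 0;
    }
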
Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); - - // copy local matrix buffer + // copy local matrix buffer const Int my_rank = g.VCRank(); const Int numCreated = dataVectors_[my_rank].size (); - - dataVectors_[my_rank].resize (numCreated + 1); - dataVectors_[my_rank][numCreated].resize (width * height); - - const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); - T* ZBuffer = reinterpret_cast < T * >(dataVectors_[my_rank][numCreated].data()); - - MemCopy (ZBuffer, reinterpret_cast < const T * >(Buffer), - height * width); - T* XBuffer = reinterpret_cast < T * >(ZBuffer); + dataVectors_[my_rank].resize ( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize ( width * height ); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); + MemCopy ( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); + T* XBuffer = reinterpret_cast ( ZBuffer ); for( Int step=0; step::Put( const Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - // data - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - mpi::TaggedISend (coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; + + mpi::TaggedISend ( sendBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm (), + matrices_[matrix_index].requests_[destination][dindex] ); + // send coordinates + const Int cindex = + NextIndexCoord ( i, j, + destination, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend ( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingCol = ( receivingCol + 1 ) % c; } } template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) { - Put( const_cast&>(Z), i, j ); + Put( const_cast&>( Z ), i, j ); } // input buffer could be modified upon exit @@ -1162,19 +1098,20 @@ void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) template void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Acc")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Acc" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; Int matrix_index, coord_index; - + //do boundary checks if( i+Z.Height() > Y.Height() 
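
The Put variant above differs from Iput in one respect, local completion: it snapshots the caller's matrix into interface-owned storage (dataVectors_) before posting any sends, so Z may be reused the moment Put returns. A condensed sketch of that buffering discipline, with hypothetical names:

    #include <mpi.h>
    #include <cstring>
    #include <vector>

    // Interface-owned staging area, a stand-in for dataVectors_; inner
    // vectors keep their heap storage stable even if the outer one grows.
    static std::vector< std::vector<double> > staging;

    // Locally complete put: copy first, then send from the copy, so the
    // caller may overwrite 'src' immediately after this returns.
    MPI_Request put_locally_complete( const double* src, int count,
                                      int destination, int tag,
                                      MPI_Comm comm )
    {
        staging.push_back( std::vector<double>( count ) );
        std::memcpy( staging.back().data(), src, count * sizeof( double ) );
        MPI_Request req;
        MPI_Isend( staging.back().data(), count, MPI_DOUBLE,
                   destination, tag, comm, &req );
        return req; // the staged copy must outlive this request
    }
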
|| j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -1182,33 +1119,26 @@ void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - - const Int YLDim = Y.LDim(); - - // copy local matrix buffer + const Int YLDim = Y.LDim(); + // copy local matrix buffer const Int my_rank = g.VCRank(); const Int numCreated = dataVectors_[my_rank].size (); + dataVectors_[my_rank].resize ( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize ( width * height ); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); + MemCopy ( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); + T* XBuffer = reinterpret_cast ( ZBuffer ); - dataVectors_[my_rank].resize (numCreated + 1); - dataVectors_[my_rank][numCreated].resize (width * height); - - const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); - T* ZBuffer = reinterpret_cast < T * >(dataVectors_[my_rank][numCreated].data()); - - MemCopy (ZBuffer, reinterpret_cast < const T * >(Buffer), - height * width); - T* XBuffer = reinterpret_cast < T * >(ZBuffer); - for( Int step=0; step::Acc( const Matrix& Z, Int i, Int j ) const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - - // data - const Int dindex = - NextIndexData (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][dindex].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); - for( Int t=0; t(coords_[coord_index].coord_[destination][cindex].data()); - coord_[0] = i; - coord_[1] = j; - coord_[2] = numEntries; - - mpi::TaggedISend (coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex]); - } - - receivingRow = (receivingRow + 1) % r; + + // acc request + mpi::TaggedISend ( sendBuffer, numEntries, destination, + DATA_ACC_TAG, g.VCComm (), + matrices_[matrix_index].requests_[destination][dindex] ); + // send coordinates + const Int cindex = + NextIndexCoord ( i, j, + destination, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend ( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + if( receivingRow == 0 ) - receivingCol = (receivingCol + 1) % c; + receivingCol = ( receivingCol + 1 ) % c; } } template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) { - Acc( const_cast&>(Z), i, j ); + Acc( const_cast&>( Z ), i, j ); 
} // waitany implementation // cannot use mpi::Waitany -// as of now because request +// as of now because request // objects are vector of deques template void AxpyInterface2::WaitAny( const Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::WaitAny")) - if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::WaitAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - - const void* base_address = static_cast(const_cast(Z.LockedBuffer())); - + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; } // search for matrix base in coords - for (Int c = 0; c < numCoords; c++) + for ( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) - { - coordIndex = c; - break; - } - coordIndex = c+1; + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; } // matrix not found if ( matrixIndex == numMatrices && - coordIndex == numCoords) - return; + coordIndex == numCoords ) + return; // data - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - if (!matrices_[matrixIndex].statuses_[rank][i]) - { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); - matrices_[matrixIndex].statuses_[rank][i] = true; - return; - } - } + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + if ( !matrices_[matrixIndex].statuses_[rank][i] ) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + return; + } + } } - + // coordinates - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - if (!coords_[coordIndex].statuses_[rank][i]) - { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); - coords_[coordIndex].statuses_[rank][i] = true; - return; - } - } + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + if ( !coords_[coordIndex].statuses_[rank][i] ) + { + mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + return; + } + } } } template void AxpyInterface2::WaitAny( Matrix& Z ) { - WaitAny( const_cast&>(Z) ); + 
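
As the comment above notes, mpi::Waitany is unusable here because the requests live in per-rank deques rather than one contiguous array, so WaitAny walks the slots and waits on the first pending one. An alternative (an assumption, not what this patch does) is to flatten the handles, call MPI_Waitany, and write the handles back:

    #include <mpi.h>
    #include <deque>
    #include <vector>

    // Wait for any one request among per-rank deques by flattening the
    // handles into a contiguous buffer; completed handles come back as
    // MPI_REQUEST_NULL and are written back to their home slots.
    int wait_any_flattened( std::vector< std::deque<MPI_Request> >& perRank )
    {
        std::vector<MPI_Request> flat;
        for( auto& dq : perRank )
            for( MPI_Request& r : dq )
                flat.push_back( r );

        if( flat.empty() )
            return -1;

        int index = -1;
        MPI_Waitany( (int)flat.size(), flat.data(), &index,
                     MPI_STATUS_IGNORE );

        int k = 0;
        for( auto& dq : perRank )
            for( MPI_Request& r : dq )
                r = flat[k++];

        return index;
    }
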
WaitAny( const_cast&>( Z ) ); } template void AxpyInterface2::Wait( const Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Wait")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Wait" ) ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - - const void* base_address = static_cast(const_cast(Z.LockedBuffer())); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; } // search for matrix base in coords - for (Int c = 0; c < numCoords; c++) + for ( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) - { - coordIndex = c; - break; - } - coordIndex = c+1; + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; } // matrix not found if ( matrixIndex == numMatrices && - coordIndex == numCoords) - return; + coordIndex == numCoords ) + return; // data - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); - matrices_[matrixIndex].statuses_[rank][i] = true; - } + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } } - + // coordinates - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); - coords_[coordIndex].statuses_[rank][i] = true; - } + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + } } } template void AxpyInterface2::Wait( Matrix& Z ) { - Wait( const_cast&>(Z) ); + Wait( const_cast&>( Z ) ); } template void AxpyInterface2::Waitall () { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Waitall")) - if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Waitall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? 
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); // data - for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex) + for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) { - for (int rank = 0; rank < p; ++rank) - { - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); - matrices_[matrixIndex].statuses_[rank][i] = true; - } - } + for ( int rank = 0; rank < p; ++rank ) + { + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } } // coordinates - for (int coordIndex = 0; coordIndex < numCoords; ++coordIndex) + for ( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) { - for (int rank = 0; rank < p; ++rank) - { - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); - coords_[coordIndex].statuses_[rank][i] = true; - } - } + for ( int rank = 0; rank < p; ++rank ) + { + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + } + } } } template bool AxpyInterface2::Test( const Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Test")) - if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Test" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? 
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - - const void* base_address = static_cast(const_cast(Z.LockedBuffer())); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; } + // search for matrix base in coords - for (Int c = 0; c < numCoords; c++) + for ( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) - { - coordIndex = c; - break; - } - coordIndex = c+1; + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; } // matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords) - return true; + if ( matrixIndex == numMatrices && + coordIndex == numCoords ) + return true; - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); - if (matrices_[matrixIndex].statuses_[rank][i]) - return false; - } + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + + if ( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } } - - for (int rank = 0; rank < p; ++rank) + + for ( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); - if (coords_[coordIndex].statuses_[rank][i]) - return false; - } + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + + if ( coords_[coordIndex].statuses_[rank][i] ) + return false; + } } return true; @@ -1581,151 +1515,157 @@ bool AxpyInterface2::Test( const Matrix& Z ) template bool AxpyInterface2::Test( Matrix& Z ) { - return Test( const_cast&>(Z) ); + return Test( const_cast&>( Z ) ); } template bool AxpyInterface2::TestAny( const Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::TestAny")) - if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::TestAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? 
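
Test, shown above, reports completion for one matrix: it re-tests every still-pending request, records the ones that finished in statuses_, and returns false at the first one that has not. Under the assumption of a flat request array, the same check collapses to a single MPI_Testall:

    #include <mpi.h>
    #include <vector>

    // Nonblocking completion check: true only when every request has
    // finished (completed handles become MPI_REQUEST_NULL and stay done).
    bool all_done( std::vector<MPI_Request>& reqs )
    {
        if( reqs.empty() )
            return true;

        int flag = 0;
        MPI_Testall( (int)reqs.size(), reqs.data(), &flag,
                     MPI_STATUSES_IGNORE );
        return flag != 0;
    }
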
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); Int matrixIndex, coordIndex; - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - - const void* base_address = static_cast(const_cast(Z.LockedBuffer())); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; } + // search for matrix base in coords - for (Int c = 0; c < numCoords; c++) + for ( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) - { - coordIndex = c; - break; - } - coordIndex = c+1; + if ( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; } // matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords) - return true; + if ( matrixIndex == numMatrices && + coordIndex == numCoords ) + return true; - for (int rank = 0; rank < p; ++rank) + for ( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); - if (matrices_[matrixIndex].statuses_[rank][i]) - continue; - else - return true; - } + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + + if ( matrices_[matrixIndex].statuses_[rank][i] ) + continue; + else + return true; + } } - - for (int rank = 0; rank < p; ++rank) + + for ( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); - if (coords_[coordIndex].statuses_[rank][i]) - continue; - else - return true; - } + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + + if ( coords_[coordIndex].statuses_[rank][i] ) + continue; + else + return true; + } } - + return false; } template bool AxpyInterface2::TestAny( Matrix& Z ) { - return TestAny( const_cast&>(Z) ); + return TestAny( const_cast&>( Z ) ); } template bool AxpyInterface2::Testall() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Testall")) - if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer at first."); - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Testall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + const Grid& g = ( toBeAttachedForPut_ ? 
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); - const Int numMatrices = matrices_.size(); const Int numCoords = coords_.size(); - + // data - for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex) - { - for (int rank = 0; rank < p; ++rank) - { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); - - for (int i = 0; i < numDataStatuses; i++) - { - matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test (matrices_[matrixIndex].requests_[rank][i]); - if (matrices_[matrixIndex].statuses_[rank][i]) - return false; - } - } + for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + { + for ( int rank = 0; rank < p; ++rank ) + { + if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + + for ( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + + if ( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } + } } // coordinates - for (int coordIndex = 0; coordIndex < numCoords; ++coordIndex) - { - for (int rank = 0; rank < p; ++rank) - { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) - continue; - - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); - - for (int i = 0; i < numCoordStatuses; i++) - { - coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); - if (coords_[coordIndex].statuses_[rank][i]) - return false; - } - } + for ( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) + { + for ( int rank = 0; rank < p; ++rank ) + { + if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + + for ( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + + if ( coords_[coordIndex].statuses_[rank][i] ) + return false; + } + } } return true; @@ -1737,139 +1677,138 @@ bool AxpyInterface2::Testall() template void AxpyInterface2::Flush( const Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Flush" ) ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + LogicError( "Must initiate transfer before flushing." ); const Grid& g = ( toBeAttachedForPut_ ? 
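
Flush, whose two overloads follow, is the progress engine of the interface: it alternates between probing for incoming put/acc/get traffic (dispatching on the message tag) and retiring its own outstanding sends until Test reports them all complete. A condensed, compilable sketch of that loop shape, with the unpack step elided and illustrative tag values:

    #include <mpi.h>
    #include <vector>

    enum { DATA_PUT_TAG = 1, DATA_ACC_TAG = 2 }; // illustrative values

    // Receive one probed message into scratch storage; the real handlers
    // unpack it into the local portion ('+=' for the accumulate tag).
    static void drain( MPI_Comm comm, MPI_Status st )
    {
        int count = 0;
        MPI_Get_count( &st, MPI_DOUBLE, &count );
        std::vector<double> buf( count );
        MPI_Recv( buf.data(), count, MPI_DOUBLE, st.MPI_SOURCE, st.MPI_TAG,
                  comm, MPI_STATUS_IGNORE );
        // ... unpack 'buf' into the local matrix here ...
    }

    // Serve incoming traffic while driving our own sends to completion.
    void flush( MPI_Comm comm, std::vector<MPI_Request>& mySends )
    {
        int done = 0;
        while( !done )
        {
            int flag = 0;
            MPI_Status status;
            MPI_Iprobe( MPI_ANY_SOURCE, MPI_ANY_TAG, comm, &flag, &status );

            if( flag && ( status.MPI_TAG == DATA_PUT_TAG ||
                          status.MPI_TAG == DATA_ACC_TAG ) )
                drain( comm, status );

            MPI_Testall( (int)mySends.size(), mySends.data(), &done,
                         MPI_STATUSES_IGNORE );
        }
    }
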
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); - + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); bool DONE = false; - mpi::Status status; + mpi::Status status; while ( !DONE ) - { - if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) - { - switch (status.MPI_TAG) - { - case DATA_PUT_TAG: - { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - break; - } - case DATA_ACC_TAG: - { - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - break; - } - } - } - // wait for requests to - // complete one by one - WaitAny (Z); - DONE = Test (Z); + { + if ( mpi::IProbe ( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status ) ) + { + switch ( status.MPI_TAG ) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + break; + } + + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + break; + } + } + } + + // wait for requests to + // complete one by one + WaitAny ( Z ); + DONE = Test ( Z ); } } template void AxpyInterface2::Flush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Flush")) - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Flush" ) ) + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + LogicError( "Must initiate transfer before flushing." ); const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); - + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); bool DONE = false; - mpi::Status status; + mpi::Status status; while ( !DONE ) - { - if ( mpi::IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status) ) - { - switch (status.MPI_TAG) - { - case DATA_PUT_TAG: - { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - break; - } - case DATA_ACC_TAG: - { - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - break; - } - case REQUEST_GET_TAG: - { - HandleGlobalToLocalData ( Z ); - break; - } - } - } - // wait for requests to - // complete one by one - WaitAny (Z); - DONE = Test (Z); + { + if ( mpi::IProbe ( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status ) ) + { + switch ( status.MPI_TAG ) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); + break; + } + + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); + break; + } + + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData ( Z ); + break; + } + } + } + + // wait for requests to + // complete one by one + WaitAny ( Z ); + DONE = Test ( Z ); } } -template < typename T > +template void AxpyInterface2::HandleLocalToGlobalData ( const Matrix& Z, Int source ) { - DistMatrix &Y = *GlobalArrayPut_; - const Grid & g = Y.Grid (); + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int myRow = g.Row (); const Int myCol = g.Col (); int height = Z.Height(); int width = Z.Width(); - // post receive for coordinates Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - COORD_PUT_TAG, g.VCComm()); - Int i = coord[0]; + mpi::TaggedRecv ( coord, 3, source, + COORD_PUT_TAG, g.VCComm() ); + Int i = coord[0]; Int j = coord[1]; Int count = coord[2]; - // data vector std::vector getVector_; - getVector_.resize (count); - - DEBUG_ONLY (if (count < Int (sizeof (T))) - LogicError ("Count was too small");) - DEBUG_ONLY (if (Int (getVector_.size ()) != count) - LogicError ("Not enough space allocated");) + getVector_.resize ( count ); - // post receive for data - T *getBuffer = getVector_.data(); - mpi::TaggedRecv (getBuffer, count, source, - 
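
The handler beginning above receives in a fixed order: first the 3-integer header, whose count field sizes the receive buffer exactly, then the payload itself. A sketch of that pairing on the receiving side (tags illustrative):

    #include <mpi.h>
    #include <vector>

    enum { COORD_PUT_TAG = 5, DATA_PUT_TAG = 1 }; // illustrative values

    // Receive one put from 'source': header first, payload second.
    void receive_put( MPI_Comm comm, int source,
                      int& i, int& j, std::vector<double>& data )
    {
        int coord[3];
        MPI_Recv( coord, 3, MPI_INT, source, COORD_PUT_TAG, comm,
                  MPI_STATUS_IGNORE );
        i = coord[0];
        j = coord[1];
        data.resize( coord[2] ); // sized exactly from the header
        MPI_Recv( data.data(), coord[2], MPI_DOUBLE, source, DATA_PUT_TAG,
                  comm, MPI_STATUS_IGNORE );
    }
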
DATA_PUT_TAG, g.VCComm()); + DEBUG_ONLY ( if ( count < Int ( sizeof ( T ) ) ) + LogicError ( "Count was too small" ); ) + DEBUG_ONLY ( if ( Int ( getVector_.size () ) != count ) + LogicError ( "Not enough space allocated" ); ) + // post receive for data + T* getBuffer = getVector_.data(); + mpi::TaggedRecv ( getBuffer, count, source, + DATA_PUT_TAG, g.VCComm() ); // Update Y - const T *XBuffer = const_cast < const T * >(getBuffer); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); - - for (Int t = 0; t < localWidth; ++t) + const T* XBuffer = const_cast ( getBuffer ); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int iLocalOffset = Length ( i, Y.ColShift(), r ); + const Int jLocalOffset = Length ( j, Y.RowShift(), c ); + + for ( Int t = 0; t < localWidth; ++t ) { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - MemCopy (YCol, XCol, localHeight); + T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t ); + const T* XCol = &XBuffer[t * localHeight]; + MemCopy ( YCol, XCol, localHeight ); } // Free the memory @@ -1879,65 +1818,61 @@ void AxpyInterface2::HandleLocalToGlobalData ( const Matrix& Z, Int source template void AxpyInterface2::HandleLocalToGlobalData( Matrix& Z, Int source ) { - HandleLocalToGlobalData( const_cast&>(Z), source ); + HandleLocalToGlobalData( const_cast&>( Z ), source ); } // replica of above function except this accumulates -template < typename T > +template void AxpyInterface2::HandleLocalToGlobalAcc ( const Matrix& Z, Int source ) { - DistMatrix &Y = *GlobalArrayPut_; - const Grid & g = Y.Grid (); + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int myRow = g.Row (); const Int myCol = g.Col (); const int height = Z.Height(); const int width = Z.Width(); - // post receive for coordinates Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - COORD_ACC_TAG, g.VCComm()); - Int i = coord[0]; + mpi::TaggedRecv ( coord, 3, source, + COORD_ACC_TAG, g.VCComm() ); + Int i = coord[0]; Int j = coord[1]; Int count = coord[2]; - // data buffer std::vector getVector_; - getVector_.resize (count); + getVector_.resize ( count ); - DEBUG_ONLY (if (count < Int (sizeof (T))) - LogicError ("Count was too small");) - - DEBUG_ONLY (if (Int (getVector_.size ()) != count) - LogicError ("Not enough space allocated");) - - // post receive for data - T *getBuffer = getVector_.data(); - mpi::TaggedRecv (getBuffer, count, source, - DATA_ACC_TAG, g.VCComm()); + DEBUG_ONLY ( if ( count < Int ( sizeof ( T ) ) ) + LogicError ( "Count was too small" ); ) + DEBUG_ONLY ( if ( Int ( getVector_.size () ) != count ) + LogicError ( "Not enough space allocated" ); ) + // post receive for data + T* getBuffer = getVector_.data(); + mpi::TaggedRecv ( getBuffer, count, source, + DATA_ACC_TAG, g.VCComm() ); // Update Y - const T *XBuffer = const_cast < 
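
The accumulate handler below repeats the put handler's unpack loop with a single change: destination columns are updated with '+=' instead of overwritten by MemCopy. Both variants condense to one routine:

    // Unpack a received column-major block into local storage; 'ldim' is
    // the local leading dimension and (iLoc, jLoc) the local image of
    // the global offsets. 'accumulate' selects '+=' versus overwrite.
    void unpack_block( double* Y, int ldim, int iLoc, int jLoc,
                       const double* X, int localHeight, int localWidth,
                       bool accumulate )
    {
        for( int t = 0; t < localWidth; ++t )
        {
            double* YCol = Y + iLoc + ( jLoc + t ) * ldim;
            const double* XCol = X + t * localHeight;

            for( int s = 0; s < localHeight; ++s )
                YCol[s] = accumulate ? YCol[s] + XCol[s] : XCol[s];
        }
    }
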
const T * >(getBuffer); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - const Int iLocalOffset = Length (i, Y.ColShift(), r); - const Int jLocalOffset = Length (j, Y.RowShift(), c); - - for (Int t = 0; t < localWidth; ++t) + const T* XBuffer = const_cast ( getBuffer ); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int iLocalOffset = Length ( i, Y.ColShift(), r ); + const Int jLocalOffset = Length ( j, Y.RowShift(), c ); + + for ( Int t = 0; t < localWidth; ++t ) { - T *YCol = Y.Buffer (iLocalOffset, jLocalOffset + t); - const T *XCol = &XBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[s] += XCol[s]; + T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t ); + const T* XCol = &XBuffer[t * localHeight]; + + for ( Int s = 0; s < localHeight; ++s ) + YCol[s] += XCol[s]; } - + // Free the memory getVector_.clear(); } @@ -1945,17 +1880,17 @@ void AxpyInterface2::HandleLocalToGlobalAcc ( const Matrix& Z, Int source template void AxpyInterface2::HandleLocalToGlobalAcc( Matrix& Z, Int source ) { - HandleLocalToGlobalAcc( const_cast&>(Z), source ); + HandleLocalToGlobalAcc( const_cast&>( Z ), source ); } // handle request for data, post a matching isend -template < typename T > +template void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) { - DEBUG_ONLY (CallStackEntry cse ("AxpyInterface::HandleGlobalToLocalData")) + DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleGlobalToLocalData" ) ) if ( !toBeAttachedForGet_ ) - LogicError("Local matrix cannot be updated"); + LogicError( "Local matrix cannot be updated" ); const DistMatrix& Y = *GlobalArrayGet_; const Grid& g = Y.Grid(); @@ -1964,140 +1899,130 @@ void AxpyInterface2::HandleGlobalToLocalData ( Matrix& Z ) const Int p = g.Size(); const Int myRow = g.Row(); const Int myCol = g.Col(); - Int i, j; Int matrix_index; std::vector recvVector_; - - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); - + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - for (Int step = 0; step < p; step++) + for ( Int step = 0; step < p; step++ ) { - mpi::Status status; - if (mpi::IProbe (mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // post receive for coordinates - Int coord[3]; - mpi::TaggedRecv (coord, 3, source, - REQUEST_GET_TAG, g.VCComm()); - i = coord[0]; - j = coord[1]; - - // we need the localwidth/height here, - // used also to calculate numEntries - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - const Int iLocalOffset = Length (i, Y.ColShift (), r); - const Int jLocalOffset = Length (j, Y.RowShift (), c); - - const Int 
numEntries = localHeight * localWidth; - - DEBUG_ONLY (if (numEntries < Int (sizeof (T))) - LogicError ("Count was too small");) - - const Int index = - NextIndexData (source, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[source][index].size ()) != - numEntries) LogicError ("Error in NextIndexData");) - - T *replyBuffer = matrices_[matrix_index].data_[source][index].data (); - - for (Int t = 0; t < localWidth; ++t) - { - T *sendCol = &replyBuffer[t * localHeight]; - const T *XCol = Y.LockedBuffer (iLocalOffset, jLocalOffset + t); - MemCopy (sendCol, XCol, localHeight); - } - - // Fire off non-blocking send - mpi::TaggedISend (replyBuffer, numEntries, source, - DATA_GET_TAG, g.VCComm (), - matrices_[matrix_index].requests_[source][index]); - } - - // receive data - if (mpi::IProbe - (mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status)) - { - const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount (status); - recvVector_.resize (count); - T *recvBuffer = recvVector_.data (); - - // Receive the data - mpi::TaggedRecv - (recvBuffer, count, source, DATA_GET_TAG, g.VCComm ()); - - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = (Y.ColAlign () + i) % r; - const Int rowAlign = (Y.RowAlign () + j) % c; - const Int colShift = Shift (myRow, colAlign, r); - const Int rowShift = Shift (myCol, rowAlign, c); - const Int localHeight = Length (height, colShift, r); - const Int localWidth = Length (width, rowShift, c); - - // Unpack the local matrix - for (Int t = 0; t < localWidth; ++t) - { - T *YCol = Z.Buffer (0, rowShift + t * c); - const T *XCol = &recvBuffer[t * localHeight]; - for (Int s = 0; s < localHeight; ++s) - YCol[colShift + s * r] = XCol[s]; - } - } + mpi::Status status; + + if ( mpi::IProbe ( mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv ( coord, 3, source, + REQUEST_GET_TAG, g.VCComm() ); + i = coord[0]; + j = coord[1]; + // we need the localwidth/height here, + // used also to calculate numEntries + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + const Int iLocalOffset = Length ( i, Y.ColShift (), r ); + const Int jLocalOffset = Length ( j, Y.RowShift (), c ); + const Int numEntries = localHeight * localWidth; + + DEBUG_ONLY ( if ( numEntries < Int ( sizeof ( T ) ) ) + LogicError ( "Count was too small" ); ) + const Int index = + NextIndexData ( source, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY ( if + ( Int ( matrices_[matrix_index].data_[source][index].size () ) != + numEntries ) LogicError ( "Error in NextIndexData" ); ) + T* replyBuffer = matrices_[matrix_index].data_[source][index].data (); + + for ( Int t = 0; t < localWidth; ++t ) + { + T* sendCol = &replyBuffer[t * localHeight]; + const T* XCol = Y.LockedBuffer ( iLocalOffset, jLocalOffset + t ); + MemCopy ( sendCol, XCol, localHeight ); + } + + // Fire off non-blocking send + mpi::TaggedISend ( replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm (), + matrices_[matrix_index].requests_[source][index] ); + } + + // receive data + if ( 
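
The request-serving path just shown answers a REQUEST_GET_TAG header by packing its locally owned entries column by column and firing a nonblocking DATA_GET_TAG reply; the branch that follows receives such replies back on the requesting side. A simplified owner-side sketch, with the Shift/Length placement arithmetic replaced by fixed values for brevity:

    #include <mpi.h>
    #include <vector>

    enum { REQUEST_GET_TAG = 7, DATA_GET_TAG = 8 }; // illustrative values

    // Answer one get request: pack the owned entries and reply.
    void serve_get( MPI_Comm comm, int source, const double* Y, int ldim,
                    std::vector<double>& reply, MPI_Request& req )
    {
        int coord[3];
        MPI_Recv( coord, 3, MPI_INT, source, REQUEST_GET_TAG, comm,
                  MPI_STATUS_IGNORE );

        // The local image of (coord[0], coord[1]) really follows from
        // the Shift/Length arithmetic; fixed sizes keep the sketch short.
        const int iLoc = coord[0], jLoc = coord[1];
        const int localHeight = 4, localWidth = 2;

        reply.resize( localHeight * localWidth );
        for( int t = 0; t < localWidth; ++t )
            for( int s = 0; s < localHeight; ++s )
                reply[s + t * localHeight] =
                    Y[( iLoc + s ) + ( jLoc + t ) * ldim];

        MPI_Isend( reply.data(), (int)reply.size(), MPI_DOUBLE, source,
                   DATA_GET_TAG, comm, &req ); // 'reply' must outlive 'req'
    }
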
mpi::IProbe + ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status ) ) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount ( status ); + recvVector_.resize ( count ); + T* recvBuffer = recvVector_.data (); + // Receive the data + mpi::TaggedRecv + ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm () ); + // Compute the local heights and offsets + const Int myRow = g.Row (); + const Int myCol = g.Col (); + const Int colAlign = ( Y.ColAlign () + i ) % r; + const Int rowAlign = ( Y.RowAlign () + j ) % c; + const Int colShift = Shift ( myRow, colAlign, r ); + const Int rowShift = Shift ( myCol, rowAlign, c ); + const Int localHeight = Length ( height, colShift, r ); + const Int localWidth = Length ( width, rowShift, c ); + + // Unpack the local matrix + for ( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Z.Buffer ( 0, rowShift + t * c ); + const T* XCol = &recvBuffer[t * localHeight]; + + for ( Int s = 0; s < localHeight; ++s ) + YCol[colShift + s * r] = XCol[s]; + } + } } - + recvVector_.clear(); } -// detach collectively +// detach collectively template void AxpyInterface2::Detach() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface2::Detach")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Detach" ) ) + // destructor will call detach again... - if (detached_) - return; + if ( detached_ ) + return; + if( !attached_ ) - LogicError("Must attach before detaching."); + LogicError( "Must attach before detaching." ); const Grid& g = ( toBeAttachedForPut_ ? - GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); - + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); mpi::Barrier( g.VCComm() ); - attached_ = false; detached_ = true; toBeAttachedForPut_ = false; toBeAttachedForGet_ = false; - GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - if (!dataVectors_.empty()) - dataVectors_.clear(); + if ( !dataVectors_.empty() ) + dataVectors_.clear(); matrices_.clear(); coords_.clear(); diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp index 8a69ab49f2..d7129ab04b 100644 --- a/src/core/RmaInterface.cpp +++ b/src/core/RmaInterface.cpp @@ -20,27 +20,23 @@ namespace El { template RmaInterface::RmaInterface() - : GlobalArrayPut_(0), GlobalArrayGet_(0), - matrices_(0), window (MPI_WIN_NULL), - putVector_(0), getVector_(0), - toBeAttachedForPut_(false), toBeAttachedForGet_(false), - attached_(false), detached_(true) + : GlobalArrayPut_( 0 ), GlobalArrayGet_( 0 ), + matrices_( 0 ), window ( MPI_WIN_NULL ), + putVector_( 0 ), getVector_( 0 ), + toBeAttachedForPut_( false ), toBeAttachedForGet_( false ), + attached_( false ), detached_( true ) { } template RmaInterface::RmaInterface( DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) ) attached_ = false; detached_ = true; - toBeAttachedForGet_ = false; toBeAttachedForPut_ = false; - GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - window = MPI_WIN_NULL; } @@ -50,96 +46,87 @@ RmaInterface::RmaInterface( DistMatrix& Z ) template RmaInterface::RmaInterface( const DistMatrix& X ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::RmaInterface")) - + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) ) attached_ = false; detached_ = true; - toBeAttachedForGet_ = false; toBeAttachedForPut_ = false; - GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; - window = MPI_WIN_NULL; } template RmaInterface::~RmaInterface() { - { if( std::uncaught_exception() ) { 
std::ostringstream os; os << "Uncaught exception detected during RmaInterface destructor " - "that required a call to Detach. Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << std::endl; + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; std::cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) + DEBUG_ONLY( DumpCallStack() ) } else - { Detach(); - } - } } template void RmaInterface::Attach( DistMatrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Attach" ) ) + // attached_ will be only set in Attach // and only unset in Detach - if (!attached_ && detached_) + if ( !attached_ && detached_ ) { attached_ = true; - detached_ = false; + detached_ = false; } else - LogicError("Must detach before reattaching."); + LogicError( "Must detach before reattaching." ); // if DistMatrix is non-const, all one-sided // transfers -- put, get and acc are possible if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) { - GlobalArrayPut_ = &Z; - toBeAttachedForPut_ = true; - GlobalArrayGet_ = &Z; - toBeAttachedForGet_ = true; - - const Grid& g = Z.Grid(); - const Int p = g.Size (); - - if ( matrices_.empty() ) - { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - } - - if (putVector_.empty()) - { - getVector_.resize( p ); - putVector_.resize( p ); - } - - // TODO rma related checks - // creation of window - const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); - const Int bufferSize = numEntries * sizeof(T); - void* baseptr = reinterpret_cast< void * >(Z.Buffer ()); - assert(baseptr != NULL); - - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + const Grid& g = Z.Grid(); + const Int p = g.Size (); + + if ( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if ( putVector_.empty() ) + { + getVector_.resize( p ); + putVector_.resize( p ); + } + + // TODO rma related checks + // creation of window + const Int numEntries = Z.LocalHeight () * Z.LocalWidth (); + const Int bufferSize = numEntries * sizeof( T ); + void* baseptr = reinterpret_cast( Z.Buffer () ); + assert( baseptr != NULL ); + mpi::WindowCreate ( baseptr, bufferSize, g.VCComm (), window ); + mpi::WindowLock ( window ); } } @@ -147,139 +134,134 @@ void RmaInterface::Attach( DistMatrix& Z ) template void RmaInterface::Attach( const DistMatrix& X ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Attach")) - if (!attached_ && detached_) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Attach" ) ) + + if ( !attached_ && detached_ ) { attached_ = true; - detached_ = false; + detached_ = false; } else - LogicError("Must detach before reattaching."); + LogicError( 
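
Attach above ends by exposing the matrix's local buffer for passive-target RMA: a window is created over the buffer and an access epoch is opened. A sketch with raw MPI-3 calls; whether the library's WindowLock wrapper maps to MPI_Win_lock_all or to per-rank locks is internal to it, so lock_all is assumed here:

    #include <mpi.h>
    #include <cassert>

    // Expose 'base' (numEntries doubles) for passive-target one-sided
    // access by every rank of 'comm'.
    MPI_Win expose( double* base, int numEntries, MPI_Comm comm )
    {
        assert( base != NULL );
        MPI_Win win;
        MPI_Win_create( base, (MPI_Aint)( numEntries * sizeof( double ) ),
                        sizeof( double ), MPI_INFO_NULL, comm, &win );
        MPI_Win_lock_all( 0, win ); // shared epoch, no assertion hints
        return win;  // pair with MPI_Win_unlock_all + MPI_Win_free
    }
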
"Must detach before reattaching." ); if( !toBeAttachedForGet_ ) { - GlobalArrayGet_ = &X; - toBeAttachedForGet_ = true; - GlobalArrayPut_ = 0; - toBeAttachedForPut_ = false; - - const Grid& g = X.Grid(); - const Int p = g.Size (); - - if (getVector_.size() != p) - { - getVector_.resize( p ); - } - - //TODO rma related checks - const Int numEntries = X.LocalHeight () * X.LocalWidth (); - const Int bufferSize = numEntries * sizeof(T); - void* baseptr = static_cast(const_cast(X.LockedBuffer ())); - assert (baseptr != NULL); - - mpi::WindowCreate (baseptr, bufferSize, g.VCComm (), window); - mpi::WindowLock (window); + GlobalArrayGet_ = &X; + toBeAttachedForGet_ = true; + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + const Grid& g = X.Grid(); + const Int p = g.Size (); + + if ( getVector_.size() != p ) + getVector_.resize( p ); + + //TODO rma related checks + const Int numEntries = X.LocalHeight () * X.LocalWidth (); + const Int bufferSize = numEntries * sizeof( T ); + void* baseptr = static_cast( const_cast( X.LockedBuffer () ) ); + assert ( baseptr != NULL ); + mpi::WindowCreate ( baseptr, bufferSize, g.VCComm (), window ); + mpi::WindowLock ( window ); } } // for standard passive rma template Int RmaInterface::NextIndex -( Int dataSize, - std::deque > &dataVectors) +( Int dataSize, + std::deque >& dataVectors ) { - DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) - + DEBUG_ONLY ( CallStackEntry cse ( "RmaInterface::NextIndex" ) ) const Int numCreated = dataVectors.size (); - - dataVectors.resize (numCreated + 1); - dataVectors[numCreated].resize (dataSize); - + dataVectors.resize ( numCreated + 1 ); + dataVectors[numCreated].resize ( dataSize ); return numCreated; } // for request-based passive rma template Int RmaInterface::NextIndex ( - Int target, - Int dataSize, - const void* base_address, - Int *mindex) + Int target, + Int dataSize, + const void* base_address, + Int* mindex ) { - DEBUG_ONLY (CallStackEntry cse ("RmaInterface::NextIndex")) - + DEBUG_ONLY ( CallStackEntry cse ( "RmaInterface::NextIndex" ) ) assert ( base_address != NULL ); - Int matrixIndex = 0; const Grid& g = ( toBeAttachedForPut_ ? 
- GlobalArrayPut_->Grid() : - GlobalArrayGet_->Grid() ); + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); const Int p = g.Size(); const Int numMatrices = matrices_.size(); // search for matrix base - for (Int m = 0; m < numMatrices; m++) + for ( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) - { - matrixIndex = m; - break; - } - // uninitiated, first time - if ( matrices_[m].base_ == NULL ) - { - matrices_[m].base_ = base_address; - matrixIndex = m; - break; - } - matrixIndex = m+1; + if ( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + // uninitiated, first time + if ( matrices_[m].base_ == NULL ) + { + matrices_[m].base_ = base_address; + matrixIndex = m; + break; + } + + matrixIndex = m+1; } - + // need to create new object - if ( matrixIndex == numMatrices) + if ( matrixIndex == numMatrices ) { - struct matrix_params_ mp; - mp.data_.resize(p); - mp.requests_.resize(p); - mp.statuses_.resize(p); - mp.base_ = NULL; - // push back new matrix_params created - // with default constructor - matrices_.push_back( mp ); - matrices_[matrixIndex].base_ = base_address; + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; } - // go through the request, data, + + // go through the request, data, // status objects const Int numCreated = matrices_[matrixIndex].data_[target].size (); - DEBUG_ONLY (if (numCreated != Int (matrices_[matrixIndex].requests_[target].size ()) || - numCreated != Int (matrices_[matrixIndex].statuses_[target].size ())) - LogicError ("size mismatch");) - - for (Int i = 0; i < numCreated; ++i) - { - // If this request is still running, - // test to see if it finished. - if (matrices_[matrixIndex].statuses_[target][i]) - { - const bool finished = mpi::Test (matrices_[matrixIndex].requests_[target][i]); - matrices_[matrixIndex].statuses_[target][i] = !finished; - } - - if (!matrices_[matrixIndex].statuses_[target][i]) - { - matrices_[matrixIndex].statuses_[target][i] = true; - matrices_[matrixIndex].data_[target][i].resize ( dataSize ); - *mindex = matrixIndex; - return i; - } - } + + DEBUG_ONLY ( if ( numCreated != Int ( matrices_[matrixIndex].requests_[target].size () ) || + numCreated != Int ( matrices_[matrixIndex].statuses_[target].size () ) ) + LogicError ( "size mismatch" ); ) + + for ( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. 
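
The loop continuing below implements request-pool recycling: each previously issued request is tested, the first slot whose send has finished is claimed and its buffer resized, and only if none has finished is a fresh slot appended. Condensed, with illustrative names:

    #include <mpi.h>
    #include <vector>

    // Find (or create) a free slot in a per-target request pool.
    int next_index( std::vector<MPI_Request>& reqs,
                    std::vector<char>& busy,
                    std::vector< std::vector<double> >& bufs, int dataSize )
    {
        for( size_t k = 0; k < reqs.size(); ++k )
        {
            if( busy[k] )
            {
                int finished = 0;
                MPI_Test( &reqs[k], &finished, MPI_STATUS_IGNORE );
                busy[k] = !finished;      // retire completed sends
            }

            if( !busy[k] )
            {
                busy[k] = 1;              // claim the recycled slot
                bufs[k].resize( dataSize );
                return (int)k;
            }
        }

        // nothing finished yet: append a new slot
        bufs.push_back( std::vector<double>( dataSize ) );
        reqs.push_back( MPI_REQUEST_NULL );
        busy.push_back( 1 );
        return (int)reqs.size() - 1;
    }
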
+ if ( matrices_[matrixIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test ( matrices_[matrixIndex].requests_[target][i] ); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } + + if ( !matrices_[matrixIndex].statuses_[target][i] ) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize ( dataSize ); + *mindex = matrixIndex; + return i; + } + } matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); matrices_[matrixIndex].statuses_[target].push_back ( true ); *mindex = matrixIndex; - return numCreated; } @@ -287,17 +269,19 @@ Int RmaInterface::NextIndex ( template void RmaInterface::Rput( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Rput")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Rput" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -305,24 +289,19 @@ void RmaInterface::Rput( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - const Int YLDim = Y.LDim (); - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast(const_cast(Z.LockedBuffer())); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); Int matrix_index; for( Int step=0; step::Rput( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = - NextIndex (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *sendBuffer = reinterpret_cast(matrices_[matrix_index].data_[destination][index].data ()); - + const Int index = + NextIndex ( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY ( if + ( Int ( matrices_[matrix_index].data_[destination][index].size () ) != + numEntries ) LogicError ( "Error in NextIndex" ); ) + + T* sendBuffer = reinterpret_cast( matrices_[matrix_index].data_[destination][index].data () ); + for( Int t=0; t void RmaInterface::Rput( Matrix& Z, Int i, Int j ) -{ - Rput( const_cast&>(Z), i, j ); -} +{ Rput( const_cast&>( Z ), i, j ); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template void RmaInterface::Racc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry 
cse("RmaInterface::Racc")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Racc" ) ) if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); + LogicError( "Global matrix cannot be updated." ); + if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); + LogicError( "Submatrix offsets must be non-negative." ); DistMatrix& Y = *GlobalArrayPut_; if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); + LogicError( "Submatrix out of bounds of global matrix." ); //do rma related checks const Grid& g = Y.Grid(); @@ -399,23 +380,18 @@ void RmaInterface::Racc( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - - const T* XBuffer = Z.LockedBuffer(); - const void* Buffer = static_cast < void * >(const_cast < T * >(Z.LockedBuffer())); - - Int matrix_index; - + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + Int matrix_index; const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -431,59 +407,62 @@ void RmaInterface::Racc( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = - NextIndex (destination, - numEntries, - Buffer, - &matrix_index); - - DEBUG_ONLY (if - (Int (matrices_[matrix_index].data_[destination][index].size ()) != - numEntries) LogicError ("Error in NextIndex");) - - T *sendBuffer = reinterpret_cast(matrices_[matrix_index].data_[destination][index].data ()); - + const Int index = + NextIndex ( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY ( if + ( Int ( matrices_[matrix_index].data_[destination][index].size () ) != + numEntries ) LogicError ( "Error in NextIndex" ); ) + + T* sendBuffer = reinterpret_cast( matrices_[matrix_index].data_[destination][index].data () ); + for( Int t=0; t void RmaInterface::Racc( Matrix& Z, Int i, Int j ) -{ - Racc( const_cast&>(Z), i, j ); -} +{ Racc( const_cast&>( Z ), i, j ); } -// Locally Blocking +// Locally Blocking template void RmaInterface::Put( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Put")) - + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Put" ) ) + if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -491,22 +470,17 @@ void RmaInterface::Put( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % 
r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - const Int YLDim = Y.LDim (); - const T* XBuffer = Z.LockedBuffer(); for( Int step=0; step::Put( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = - NextIndex ( numEntries, - putVector_[destination]); - - T* sendBuffer = putVector_[destination][index].data(); + const Int index = + NextIndex ( numEntries, + putVector_[destination] ); + T* sendBuffer = putVector_[destination][index].data(); for( Int t=0; t void RmaInterface::Put( Matrix& Z, Int i, Int j ) -{ - Put( const_cast&>(Z), i, j ); -} +{ Put( const_cast&>( Z ), i, j ); } // accumulate = Update Y(i:i+height-1,j:j+width-1) += X, // where X is height x width template void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Acc")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Acc" ) ) if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); + LogicError( "Global matrix cannot be updated." ); + if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); + LogicError( "Submatrix offsets must be non-negative." ); DistMatrix& Y = *GlobalArrayPut_; if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); + LogicError( "Submatrix out of bounds of global matrix." 
); //do rma related checks - const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const T* XBuffer = Z.LockedBuffer(); - + for( Int step=0; step::Acc( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - putVector_[destination]); - + const Int index = RmaInterface::NextIndex ( numEntries, + putVector_[destination] ); T* sendBuffer = putVector_[destination][index].data(); for( Int t=0; t void RmaInterface::Acc( Matrix& Z, Int i, Int j ) -{ - Acc( const_cast&>(Z), i, j ); -} +{ Acc( const_cast&>( Z ), i, j ); } // TODO Iget and Rget template void RmaInterface::Get( Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Get")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Get" ) ) + // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached if ( !toBeAttachedForGet_ ) - LogicError ("Cannot perform this operation as matrix is not attached."); + LogicError ( "Cannot perform this operation as matrix is not attached." 
); - const DistMatrix &X = *GlobalArrayGet_; - - const Grid & g = X.Grid (); + const DistMatrix& X = *GlobalArrayGet_; + const Grid& g = X.Grid (); const Int r = g.Height (); const Int c = g.Width (); const Int p = g.Size (); @@ -658,22 +631,18 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) const Int myCol = g.Col (); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - // local width and height const Int height = Z.Height(); const Int width = Z.Width(); - if (i + height > X.Height () || j + width > X.Width ()) - LogicError("Submatrix out of bounds of global matrix"); - - const Int colAlign = (X.ColAlign() + i) % r; - const Int rowAlign = (X.RowAlign() + j) % c; - - const Int iLocalOffset = Length (i, X.ColShift (), r); - const Int jLocalOffset = Length (j, X.RowShift (), c); + if ( i + height > X.Height () || j + width > X.Width () ) + LogicError( "Submatrix out of bounds of global matrix" ); + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; + const Int iLocalOffset = Length ( i, X.ColShift (), r ); + const Int jLocalOffset = Length ( j, X.RowShift (), c ); const Int XLDim = X.LDim (); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -688,33 +657,37 @@ void RmaInterface::Get( Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = RmaInterface::NextIndex ( numEntries, - getVector_[destination]); - T *getBuffer = getVector_[destination][index].data (); + const Int index = RmaInterface::NextIndex ( numEntries, + getVector_[destination] ); + T* getBuffer = getVector_[destination][index].data (); // get for( Int t=0; t::Get( Matrix& Z, Int i, Int j ) template void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iput")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Iput" ) ) if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); + LogicError( "Submatrix offsets must be non-negative" ); + if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated"); + LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; + //do rma related checks if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -740,22 +715,17 @@ void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - const Int YLDim = Y.LDim (); - const T* XBuffer = Z.LockedBuffer(); for( Int step=0; step::Iput( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = - NextIndex ( - numEntries, - putVector_[destination] - ); - + const Int index = + NextIndex ( + numEntries, + putVector_[destination] + ); T* sendBuffer = 
putVector_[destination][index].data(); for( Int t=0; t::Iput( const Matrix& Z, Int i, Int j ) template void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::Iacc")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Iacc" ) ) if ( !toBeAttachedForPut_ ) - LogicError("Global matrix cannot be updated."); + LogicError( "Global matrix cannot be updated." ); + if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative."); + LogicError( "Submatrix offsets must be non-negative." ); DistMatrix& Y = *GlobalArrayPut_; if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix."); + LogicError( "Submatrix out of bounds of global matrix." ); //TODO rma related checks const Grid& g = Y.Grid(); @@ -820,20 +794,16 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); const Int YLDim = Y.LDim (); // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); - const T* XBuffer = Z.LockedBuffer(); - const Int iLocalOffset = Length( i, Y.ColShift (), r ); const Int jLocalOffset = Length( j, Y.RowShift (), c ); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; @@ -849,73 +819,72 @@ void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; - const Int index = - NextIndex ( - numEntries, - putVector_[destination]); - + const Int index = + NextIndex ( + numEntries, + putVector_[destination] ); T* sendBuffer = putVector_[destination][index].data(); for( Int t=0; t void RmaInterface::Iput( Matrix& Z, Int i, Int j ) -{ - Iput( const_cast&>(Z), i, j ); -} +{ Iput( const_cast&>( Z ), i, j ); } template void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) -{ - Iacc( const_cast&>(Z), i, j ); -} +{ Iacc( const_cast&>( Z ), i, j ); } -// Local completion of all ops upon +// Local completion of all ops upon // return template void RmaInterface::LocalFlush() { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalFlush")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::LocalFlush" ) ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + LogicError( "Must initiate transfer before flushing." ); mpi::FlushLocal ( window ); } -// Local completion (specific to Z) upon +// Local completion (specific to Z) upon // return template void RmaInterface::LocalFlush( Matrix& Z ) { - DEBUG_ONLY(CallStackEntry cse("RmaInterface::LocalFlush")) + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::LocalFlush" ) ) + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) - LogicError("Must initiate transfer before flushing."); + LogicError( "Must initiate transfer before flushing." 
 );
 
     // if there are no request based RMA pending
     // for Z, then this function acts like Flush
     // local all
     if ( !anyPendingXfers( Z ) )
-    {
-        LocalFlush();
-    }
+        LocalFlush();
     else
-        Wait ( Z );
+        Wait ( Z );
 }
 
 // there is no use as of now in
@@ -925,226 +894,225 @@ void RmaInterface<T>::LocalFlush( Matrix<T>& Z )
 template<typename T>
 void RmaInterface<T>::Flush( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Flush"))
-
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Flush" ) )
+
     if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer before flushing.");
-
+        LogicError( "Must initiate transfer before flushing." );
+
     mpi::Flush ( window );
 }
 
 template<typename T>
 bool RmaInterface<T>::anyPendingXfers ( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::anyPendingXfers"))
-
-    // by default, number of matrices
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::anyPendingXfers" ) )
+    // by default, number of matrices
     // == number of processes
     Int matrixIndex;
     const Int numMatrices = matrices_.size();
-    const void *base_address = static_cast<void*>(const_cast<T*>(Z.LockedBuffer()));
+    const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
 
     // search for matrix base
-    for (Int m = 0; m < numMatrices; m++)
+    for ( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
-        {
-            matrixIndex = m;
-            break;
-        }
-        matrixIndex = m+1;
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+
+        matrixIndex = m+1;
     }
 
     // matrix not found
     if ( matrixIndex == numMatrices )
-        return false;
+        return false;
 
     return true;
 }
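The LocalFlush/Flush pair above distinguishes the two MPI-3 completion levels: local completion means the origin buffer may be reused, remote completion means the update is also done at the target. Assuming mpi::FlushLocal and mpi::Flush wrap the standard window calls, the distinction is:

    // Local completion only: the origin buffer is reusable, but the data
    // may not yet be visible at the target.
    MPI_Win_flush_local( target_rank, win );

    // Remote completion: the operation has also finished at the target.
    MPI_Win_flush( target_rank, win );

    // Whole-window variants, matching the argument-free overloads above:
    MPI_Win_flush_local_all( win );
    MPI_Win_flush_all( win );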
 
 // waitany implementation
 // cannot use mpi::Waitany
-// as of now because request
+// objects are vector of deques
 template<typename T>
 void RmaInterface<T>::WaitAny( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::WaitAny"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::WaitAny" ) )
 
-    const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
+    const Grid& g = ( toBeAttachedForPut_ ?
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
     Int matrixIndex;
-    const Int numMatrices = matrices_.size();
-
-    const void *base_address = static_cast<void*>(const_cast<T*>(Z.LockedBuffer()));
+    const Int numMatrices = matrices_.size();
+    const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
 
     // search for matrix base
-    for (Int m = 0; m < numMatrices; m++)
+    for ( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
-        {
-            matrixIndex = m;
-            break;
-        }
-        matrixIndex = m+1;
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+
+        matrixIndex = m+1;
     }
 
     // matrix not found
     if ( matrixIndex == numMatrices )
-        return;
+        return;
 
     // data
-    for (int rank = 0; rank < p; ++rank)
+    for ( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
-            continue;
-
-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-        for (int i = 0; i < numDataStatuses; i++)
-        {
-            if (!matrices_[matrixIndex].statuses_[rank][i])
-            {
-                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
-                matrices_[matrixIndex].statuses_[rank][i] = true;
-                return;
-            }
-        }
+        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            continue;
+
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+        for ( int i = 0; i < numDataStatuses; i++ )
+        {
+            if ( !matrices_[matrixIndex].statuses_[rank][i] )
+            {
+                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+                matrices_[matrixIndex].statuses_[rank][i] = true;
+                return;
+            }
+        }
     }
 }
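The comment on WaitAny above notes why MPI_Waitany cannot be used directly: the requests live in per-rank std::deque containers rather than one contiguous array. If the pending requests were first gathered into a contiguous buffer, the native call would apply; a minimal sketch, illustrative only:

    #include <mpi.h>
    #include <vector>

    // Block until any one of the gathered requests completes.
    static int WaitAnyOf( std::vector<MPI_Request>& gathered )
    {
        int which = MPI_UNDEFINED;
        MPI_Waitany( static_cast<int>( gathered.size() ), gathered.data(),
                     &which, MPI_STATUS_IGNORE );
        return which;  // index of the completed request, or MPI_UNDEFINED
    }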
 
 template<typename T>
 void RmaInterface<T>::Wait( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Wait"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Wait" ) )
+
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
 
     const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
     Int matrixIndex;
-    const Int numMatrices = matrices_.size();
-
-    const void *base_address = static_cast<void*>(const_cast<T*>(Z.LockedBuffer()));
+    const Int numMatrices = matrices_.size();
+    const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
 
     // search for matrix base
-    for (Int m = 0; m < numMatrices; m++)
+    for ( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
-        {
-            matrixIndex = m;
-            break;
-        }
-        matrixIndex = m+1;
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+
+        matrixIndex = m+1;
     }
 
     // matrix not found
     if ( matrixIndex == numMatrices )
-        return;
+        return;
 
     // data
-    for (int rank = 0; rank < p; ++rank)
+    for ( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
-            continue;
-
-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-        for (int i = 0; i < numDataStatuses; i++)
-        {
-            mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
-            matrices_[matrixIndex].statuses_[rank][i] = true;
-        }
+        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            continue;
+
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+        for ( int i = 0; i < numDataStatuses; i++ )
+        {
+            mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+            matrices_[matrixIndex].statuses_[rank][i] = true;
+        }
     }
 }
 
 template<typename T>
 void RmaInterface<T>::Waitall ()
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Waitall"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Waitall" ) )
+
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
 
     const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
     Int matrixIndex;
-    const Int numMatrices = matrices_.size();
+    const Int numMatrices = matrices_.size();
 
     // data
-    for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex)
+    for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
     {
-        for (int rank = 0; rank < p; ++rank)
-        {
-            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-            for (int i = 0; i < numDataStatuses; i++)
-            {
-                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
-                matrices_[matrixIndex].statuses_[rank][i] = true;
-            }
-        }
+        for ( int rank = 0; rank < p; ++rank )
+        {
+            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+            for ( int i = 0; i < numDataStatuses; i++ )
+            {
+                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+                matrices_[matrixIndex].statuses_[rank][i] = true;
+            }
+        }
     }
 }
 
 template<typename T>
 bool RmaInterface<T>::Test( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Test"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Test" ) )
+
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
 
-    const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+    const Grid& g = ( toBeAttachedForPut_ ?
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
     Int matrixIndex;
-    const Int numMatrices = matrices_.size();
-
-    const void *base_address = static_cast<void*>(const_cast<T*>(Z.LockedBuffer()));
+    const Int numMatrices = matrices_.size();
+    const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
 
     // search for matrix base
-    for (Int m = 0; m < numMatrices; m++)
+    for ( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
-        {
-            matrixIndex = m;
-            break;
-        }
-        matrixIndex = m+1;
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+
+        matrixIndex = m+1;
     }
 
     // matrix not found
     if ( matrixIndex == numMatrices )
-        return true;
+        return true;
 
-    for (int rank = 0; rank < p; ++rank)
+    for ( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
-            continue;
-
-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-        for (int i = 0; i < numDataStatuses; i++)
-        {
-            matrices_[matrixIndex].statuses_[rank][i] =
-                !mpi::Test (matrices_[matrixIndex].requests_[rank][i]);
-            if (matrices_[matrixIndex].statuses_[rank][i])
-                return false;
-        }
+        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            continue;
+
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+        for ( int i = 0; i < numDataStatuses; i++ )
+        {
+            matrices_[matrixIndex].statuses_[rank][i] =
+                !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+
+            if ( matrices_[matrixIndex].statuses_[rank][i] )
+                return false;
+        }
     }
 
     return true;
@@ -1157,52 +1125,52 @@ bool RmaInterface<T>::Test( Matrix<T>& Z )
 template<typename T>
 bool RmaInterface<T>::TestAny( Matrix<T>& Z )
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::TestAny"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::TestAny" ) )
+
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
 
-    const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+    const Grid& g = ( toBeAttachedForPut_ ?
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
     Int matrixIndex;
-    const Int numMatrices = matrices_.size();
-
-    const void *base_address = static_cast<void*>(const_cast<T*>(Z.LockedBuffer()));
+    const Int numMatrices = matrices_.size();
+    const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
 
     // search for matrix base
-    for (Int m = 0; m < numMatrices; m++)
+    for ( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
-        {
-            matrixIndex = m;
-            break;
-        }
-        matrixIndex = m+1;
+        if ( matrices_[m].base_ == base_address )
+        {
+            matrixIndex = m;
+            break;
+        }
+
+        matrixIndex = m+1;
     }
 
     // matrix not found
     if ( matrixIndex == numMatrices )
-        return true;
+        return true;
 
-    for (int rank = 0; rank < p; ++rank)
+    for ( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
-            continue;
-
-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-        for (int i = 0; i < numDataStatuses; i++)
-        {
-            matrices_[matrixIndex].statuses_[rank][i] =
-                !mpi::Test (matrices_[matrixIndex].requests_[rank][i]);
-            if (matrices_[matrixIndex].statuses_[rank][i])
-                continue;
-            else
-                return true;
-        }
+        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            continue;
+
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+        for ( int i = 0; i < numDataStatuses; i++ )
+        {
+            matrices_[matrixIndex].statuses_[rank][i] =
+                !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+
+            if ( matrices_[matrixIndex].statuses_[rank][i] )
+                continue;
+            else
+                return true;
+        }
     }
 
     return false;
@@ -1211,35 +1179,36 @@ bool RmaInterface<T>::TestAny( Matrix<T>& Z )
 template<typename T>
 bool RmaInterface<T>::Testall()
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Testall"))
-    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
-        LogicError("Must initiate transfer at first.");
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Testall" ) )
+
+    if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
+        LogicError( "Must initiate transfer at first." );
 
     const Grid& g = ( toBeAttachedForPut_ ?
-                      GlobalArrayPut_->Grid() :
-                      GlobalArrayGet_->Grid() );
+                      GlobalArrayPut_->Grid() :
+                      GlobalArrayGet_->Grid() );
     const Int p = g.Size();
-    const Int numMatrices = matrices_.size();
-
+    const Int numMatrices = matrices_.size();
+
     // data
-    for (int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex)
-    {
-        for (int rank = 0; rank < p; ++rank)
-        {
-            if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
-                continue;
-
-            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
-
-            for (int i = 0; i < numDataStatuses; i++)
-            {
-                matrices_[matrixIndex].statuses_[rank][i] =
-                    !mpi::Test (matrices_[matrixIndex].requests_[rank][i]);
-                if (matrices_[matrixIndex].statuses_[rank][i])
-                    return false;
-            }
-        }
+    for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
+    {
+        for ( int rank = 0; rank < p; ++rank )
+        {
+            if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+                continue;
+
+            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+
+            for ( int i = 0; i < numDataStatuses; i++ )
+            {
+                matrices_[matrixIndex].statuses_[rank][i] =
+                    !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+
+                if ( matrices_[matrixIndex].statuses_[rank][i] )
+                    return false;
+            }
+        }
     }
 
     return true;
 }
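Testall above polls every outstanding request by hand and reports false at the first one still in flight. With contiguous request storage the same answer comes from a single MPI_Testall call; a sketch, illustrative only:

    #include <mpi.h>
    #include <vector>

    static bool AllDone( std::vector<MPI_Request>& reqs )
    {
        int flag = 0;
        // flag becomes nonzero only when every request has completed
        MPI_Testall( static_cast<int>( reqs.size() ), reqs.data(),
                     &flag, MPI_STATUSES_IGNORE );
        return flag != 0;
    }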
@@ -1248,33 +1217,30 @@ bool RmaInterface<T>::Testall()
 template<typename T>
 void RmaInterface<T>::Detach()
 {
-    DEBUG_ONLY(CallStackEntry cse("RmaInterface::Detach"))
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Detach" ) )
+
     // destructor will call detach again...
-    if (detached_)
+    if ( detached_ )
         return;
+
     if( !attached_ )
-        LogicError("Must attach before detaching.");
+        LogicError( "Must attach before detaching." );
 
     const Grid& g = ( toBeAttachedForPut_ ?
                       GlobalArrayPut_->Grid() :
                       GlobalArrayGet_->Grid() );
-
     mpi::Barrier( g.VCComm() );
-
     attached_ = false;
     detached_ = true;
     toBeAttachedForPut_ = false;
     toBeAttachedForGet_ = false;
-
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;
-
     putVector_.clear();
     getVector_.clear();
     matrices_.clear();
-
-    mpi::WindowUnlock (window);
-    mpi::WindowFree (window);
+    mpi::WindowUnlock ( window );
+    mpi::WindowFree ( window );
 }
 
 #define PROTO(T) template class RmaInterface<T>;
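Detach above serializes teardown: a barrier so that no rank releases the window while its neighbors may still be issuing RMA against it, then the passive-target lock is dropped and the window freed. Assuming mpi::WindowUnlock and mpi::WindowFree wrap the standard calls, the required ordering is:

    MPI_Barrier( comm );          // every rank has finished issuing RMA
    MPI_Win_unlock_all( win );    // or MPI_Win_unlock per target; ends the epoch
    MPI_Win_free( &win );         // collective; the buffer may be reused after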
From 0bf9fc551e0e434f00a4bc90c07c867bd3ea77ce Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Wed, 25 Feb 2015 13:47:13 -0800
Subject: [PATCH 109/110] final round of indentation checks

---
 src/core/AxpyInterface.cpp    |  701 ++++++++++----------
 src/core/AxpyInterface2.0.cpp | 1148 ++++++++++++++++-----------------
 src/core/RmaInterface.cpp     |  440 +++++++------
 3 files changed, 1161 insertions(+), 1128 deletions(-)

diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp
index ea3a38880d..0eb4629f91 100644
--- a/src/core/AxpyInterface.cpp
+++ b/src/core/AxpyInterface.cpp
@@ -17,20 +17,22 @@ namespace El {
 
 #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY)
 #else
-template<typename T> bool AxpyInterface<T>::Finished ()
+template<typename T> bool AxpyInterface<T>::Finished()
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Finished" );
-                 if ( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ )
-                 LogicError ( "Not attached" ); )
-    const Grid& g = ( attachedForLocalToGlobal_ ?
-                      localToGlobalMat_->Grid () :
-                      globalToLocalMat_->Grid () );
-    const Int p = g.Size ();
+    DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Finished" );
+
+                if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ )
+                    LogicError( "Not attached" ); )
+    const Grid& g = ( attachedForLocalToGlobal_ ?
+                      localToGlobalMat_->Grid() :
+                      globalToLocalMat_->Grid() );
+
+    const Int p = g.Size();
     bool finished = true;
 
-    for ( Int rank = 0; rank < p; ++rank )
+    for( Int rank = 0; rank < p; ++rank )
     {
-        if ( !sentEomTo_[rank] || !haveEomFrom_[rank] )
+        if( !sentEomTo_[rank] || !haveEomFrom_[rank] )
         {
             finished = false;
             break;
@@ -40,59 +42,59 @@
     return finished;
 }
 
-template<typename T> void AxpyInterface<T>::HandleEoms ()
+template<typename T> void AxpyInterface<T>::HandleEoms()
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleEoms" ) )
+    DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleEoms" ) )
     const Grid& g = ( attachedForLocalToGlobal_ ?
-                      localToGlobalMat_->Grid () :
-                      globalToLocalMat_->Grid () );
-    const Int p = g.Size ();
-    UpdateRequestStatuses ();
+                      localToGlobalMat_->Grid() :
+                      globalToLocalMat_->Grid() );
+    const Int p = g.Size();
+    UpdateRequestStatuses();
 
     // Try to progress our EOM sends
-    for ( Int i = 0; i < p; ++i )
+    for( Int i = 0; i < p; ++i )
     {
-        if ( !sentEomTo_[i] )
+        if( !sentEomTo_[i] )
         {
             bool shouldSendEom = true;
-            const Int numSends = sendingData_[i].size ();
+            const Int numSends = sendingData_[i].size();
 
-            for ( Int j = 0; j < numSends; ++j )
+            for( Int j = 0; j < numSends; ++j )
             {
-                if ( sendingData_[i][j] )
+                if( sendingData_[i][j] )
                 {
                     shouldSendEom = false;
                     break;
                 }
             }
 
-            const Int numRequests = sendingRequest_[i].size ();
+            const Int numRequests = sendingRequest_[i].size();
 
-            for ( Int j = 0; j < numRequests; ++j )
+            for( Int j = 0; j < numRequests; ++j )
             {
-                if ( !shouldSendEom || sendingRequest_[i][j] )
+                if( !shouldSendEom || sendingRequest_[i][j] )
                 {
                     shouldSendEom = false;
                     break;
                 }
             }
 
-            const Int numReplies = sendingReply_[i].size ();
+            const Int numReplies = sendingReply_[i].size();
 
-            for ( Int j = 0; j < numReplies; ++j )
+            for( Int j = 0; j < numReplies; ++j )
             {
-                if ( !shouldSendEom || sendingReply_[i][j] )
+                if( !shouldSendEom || sendingReply_[i][j] )
                 {
                     shouldSendEom = false;
                     break;
                 }
             }
 
-            if ( shouldSendEom )
+            if( shouldSendEom )
             {
                 mpi::Request& request = eomSendRequests_[i];
                 mpi::TaggedISSend
-                ( &sendDummy_, 1, i, EOM_TAG, g.VCComm (), request );
+                ( &sendDummy_, 1, i, EOM_TAG, g.VCComm(), request );
                 sentEomTo_[i] = true;
             }
         }
@@ -100,52 +102,50 @@
 
     mpi::Status status;
 
-    if ( mpi::IProbe ( mpi::ANY_SOURCE, EOM_TAG, g.VCComm (), status ) )
+    if( mpi::IProbe( mpi::ANY_SOURCE, EOM_TAG, g.VCComm(), status ) )
     {
         const Int source = status.MPI_SOURCE;
-        mpi::TaggedRecv ( &recvDummy_, 1, source, EOM_TAG, g.VCComm () );
+        mpi::TaggedRecv( &recvDummy_, 1, source, EOM_TAG, g.VCComm() );
         haveEomFrom_[source] = true;
     }
 }
 #endif // EL_USE_IBARRIER_FOR_AXPY
 
-template<typename T> void AxpyInterface<T>::HandleLocalToGlobalData ()
+template<typename T> void AxpyInterface<T>::HandleLocalToGlobalData()
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleLocalToGlobalData" ) )
+    DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleLocalToGlobalData" ) )
     DistMatrix<T>& Y = *localToGlobalMat_;
-    const Grid& g = Y.Grid ();
-    const Int r = g.Height ();
-    const Int c = g.Width ();
-    const Int myRow = g.Row ();
-    const Int myCol = g.Col ();
+    const Grid& g = Y.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int myRow = g.Row();
+    const Int myCol = g.Col();
     mpi::Status status;
 
-    if ( mpi::IProbe ( mpi::ANY_SOURCE, DATA_TAG, g.VCComm (), status ) )
+    if( mpi::IProbe( mpi::ANY_SOURCE, DATA_TAG, g.VCComm(), status ) )
     {
         // Message exists, so recv and pack
-        const Int count = mpi::GetCount ( status );
+        const Int count = mpi::GetCount( status );
 
-        DEBUG_ONLY ( if ( count < Int ( 4 * sizeof ( Int ) + sizeof ( T ) ) )
-                     LogicError ( "Count was too small" ); )
-        ;
+        DEBUG_ONLY( if( count < Int( 4 * sizeof( Int ) + sizeof( T ) ) )
+                        LogicError( "Count was too small" ); )
         const Int source = status.MPI_SOURCE;
-        recvVector_.resize ( count );
-        byte* recvBuffer = recvVector_.data ();
-        mpi::TaggedRecv ( recvBuffer, count, source, DATA_TAG, g.VCComm () );
+        recvVector_.resize( count );
+        byte* recvBuffer = recvVector_.data();
+        mpi::TaggedRecv( recvBuffer, count, source, DATA_TAG, g.VCComm() );
 
         // Extract the header
         byte* head = recvBuffer;
         const Int i = *reinterpret_cast<const Int*> ( head );
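Each locally-owned patch arrives in one message whose header encodes the global offsets and extents followed by the scalar and the payload, i.e. [ i | j | height | width | alpha | column-major data ]. The same packing with well-defined aliasing (std::memcpy instead of pointer casts; a sketch, not the code above):

    #include <cstring>
    #include <vector>

    template<typename T>
    std::vector<unsigned char> PackHeader( int i, int j, int h, int w, T alpha )
    {
        std::vector<unsigned char> buf( 4*sizeof(int) + sizeof(T) );
        unsigned char* p = buf.data();
        std::memcpy( p, &i, sizeof(int) );  p += sizeof(int);
        std::memcpy( p, &j, sizeof(int) );  p += sizeof(int);
        std::memcpy( p, &h, sizeof(int) );  p += sizeof(int);
        std::memcpy( p, &w, sizeof(int) );  p += sizeof(int);
        std::memcpy( p, &alpha, sizeof(T) );
        return buf;  // h*w values of T would follow in the real message
    }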
- head += sizeof ( Int ); + head += sizeof( Int ); const Int j = *reinterpret_cast ( head ); - head += sizeof ( Int ); + head += sizeof( Int ); const Int height = *reinterpret_cast ( head ); - head += sizeof ( Int ); + head += sizeof( Int ); const Int width = *reinterpret_cast ( head ); - head += sizeof ( Int ); + head += sizeof( Int ); const T alpha = *reinterpret_cast ( head ); - head += sizeof ( T ); - + head += sizeof( T ); DEBUG_ONLY ( if ( height < 0 || width < 0 ) RuntimeError ( "Unpacked heights were negative:\n", @@ -168,101 +168,102 @@ template void AxpyInterface ::HandleLocalToGlobalData () " i= ", i, std::hex, "(", i, ")\n", std::dec, " j= ", j, std::hex, "(", j, ")\n", std::dec, " height=", height, std::hex, "(", height, ")\n", - std::dec, " width= ", width, std::hex, "(", width, ")\n", - std::dec, " alpha= ", alpha ); ) - - // Update Y + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + ) + + // Update Y const T* XBuffer = reinterpret_cast ( head ); - const Int colAlign = ( Y.ColAlign () + i ) % r; - const Int rowAlign = ( Y.RowAlign () + j ) % c; - const Int colShift = Shift ( myRow, colAlign, r ); - const Int rowShift = Shift ( myCol, rowAlign, c ); - const Int localHeight = Length ( height, colShift, r ); - const Int localWidth = Length ( width, rowShift, c ); - const Int iLocalOffset = Length ( i, Y.ColShift (), r ); - const Int jLocalOffset = Length ( j, Y.RowShift (), c ); - - for ( Int t = 0; t < localWidth; ++t ) + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + + for( Int t = 0; t < localWidth; ++t ) { - T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t ); + T* YCol = Y.Buffer( iLocalOffset, jLocalOffset + t ); const T* XCol = &XBuffer[t * localHeight]; - for ( Int s = 0; s < localHeight; ++s ) + for( Int s = 0; s < localHeight; ++s ) YCol[s] += alpha * XCol[s]; } // Free the memory for the recv buffer - recvVector_.clear (); + recvVector_.clear(); } } template -void AxpyInterface ::HandleGlobalToLocalRequest () +void AxpyInterface ::HandleGlobalToLocalRequest() { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleGlobalToLocalRequest" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleGlobalToLocalRequest" ) ) const DistMatrix & X = *globalToLocalMat_; - const Grid& g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); + const Grid& g = X.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); mpi::Status status; - if ( mpi::IProbe ( mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm (), status ) ) + if( mpi::IProbe( mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm(), status ) ) { // Request exists, so recv const Int source = status.MPI_SOURCE; - const Int recvSize = 4 * sizeof ( Int ); - recvVector_.resize ( recvSize ); + const Int recvSize = 4 * sizeof( Int ); + recvVector_.resize( recvSize ); + byte* recvBuffer = recvVector_.data(); - byte* recvBuffer = recvVector_.data (); mpi::TaggedRecv - ( recvBuffer, recvSize, source, DATA_REQUEST_TAG, g.VCComm () ); + ( recvBuffer, recvSize, source, 
DATA_REQUEST_TAG, g.VCComm() ); // Extract the header const byte* recvHead = recvBuffer; const Int i = *reinterpret_cast ( recvHead ); - recvHead += sizeof ( Int ); + recvHead += sizeof( Int ); const Int j = *reinterpret_cast ( recvHead ); - recvHead += sizeof ( Int ); + recvHead += sizeof( Int ); const Int height = *reinterpret_cast ( recvHead ); - recvHead += sizeof ( Int ); + recvHead += sizeof( Int ); const Int width = *reinterpret_cast ( recvHead ); - recvHead += sizeof ( Int ); + recvHead += sizeof( Int ); - const Int colAlign = ( X.ColAlign () + i ) % r; - const Int rowAlign = ( X.RowAlign () + j ) % c; - const Int colShift = Shift ( myRow, colAlign, r ); - const Int rowShift = Shift ( myCol, rowAlign, c ); - const Int iLocalOffset = Length ( i, X.ColShift (), r ); - const Int jLocalOffset = Length ( j, X.RowShift (), c ); - const Int localHeight = Length ( height, colShift, r ); - const Int localWidth = Length ( width, rowShift, c ); + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int iLocalOffset = Length( i, X.ColShift(), r ); + const Int jLocalOffset = Length( j, X.RowShift(), c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - const Int bufferSize = 2 * sizeof ( Int ) + numEntries * sizeof ( T ); - const Int index = ReadyForSend ( bufferSize, replyVectors_[source], - replySendRequests_[source], - sendingReply_[source] ); + const Int bufferSize = 2 * sizeof( Int ) + numEntries * sizeof( T ); + const Int index = ReadyForSend( bufferSize, replyVectors_[source], + replySendRequests_[source], + sendingReply_[source] ); // Pack the reply header - byte* sendBuffer = replyVectors_[source][index].data (); + byte* sendBuffer = replyVectors_[source][index].data(); byte* sendHead = sendBuffer; *reinterpret_cast ( sendHead ) = myRow; - sendHead += sizeof ( Int ); + sendHead += sizeof( Int ); *reinterpret_cast ( sendHead ) = myCol; - sendHead += sizeof ( Int ); + sendHead += sizeof( Int ); // Pack the payload T* sendData = reinterpret_cast ( sendHead ); - for ( Int t = 0; t < localWidth; ++t ) + for( Int t = 0; t < localWidth; ++t ) { T* sendCol = &sendData[t * localHeight]; - const T* XCol = X.LockedBuffer ( iLocalOffset, jLocalOffset + t ); - MemCopy ( sendCol, XCol, localHeight ); + const T* XCol = X.LockedBuffer( iLocalOffset, jLocalOffset + t ); + MemCopy( sendCol, XCol, localHeight ); } // Fire off non-blocking send mpi::TaggedISSend - ( sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm (), + ( sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm(), replySendRequests_[source][index] ); } } @@ -302,6 +303,7 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) haveEomFrom_.resize( p, false ); eomSendRequests_.resize( p ); #endif + sendingData_.resize( p ); sendingRequest_.resize( p ); sendingReply_.resize( p ); @@ -331,37 +333,37 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) globalToLocalMat_ = &X; } - const Int p = X.Grid ().Size (); + const Int p = X.Grid().Size(); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else - sentEomTo_.resize ( p, false ); - haveEomFrom_.resize ( p, false ); - eomSendRequests_.resize ( p ); + sentEomTo_.resize( p, false ); + haveEomFrom_.resize( p, false ); + eomSendRequests_.resize( p ); #endif - sendingData_.resize ( p ); - 
sendingRequest_.resize ( p ); - sendingReply_.resize ( p ); - - dataSendRequests_.resize ( p ); - requestSendRequests_.resize ( p ); - replySendRequests_.resize ( p ); + sendingData_.resize( p ); + sendingRequest_.resize( p ); + sendingReply_.resize( p ); + + dataSendRequests_.resize( p ); + requestSendRequests_.resize( p ); + replySendRequests_.resize( p ); - dataVectors_.resize ( p ); - requestVectors_.resize ( p ); - replyVectors_.resize ( p ); + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); } -template AxpyInterface ::~AxpyInterface () +template AxpyInterface ::~AxpyInterface() { - if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) { - if ( std::uncaught_exception () ) + if( std::uncaught_exception() ) { const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid () ); + localToGlobalMat_->Grid() : + globalToLocalMat_->Grid() ); std::ostringstream os; - os << g.Rank () + os << g.Rank() << "Uncaught exception detected during AxpyInterface destructor " "that required a call to Detach. Instead of allowing for the " @@ -369,25 +371,25 @@ template AxpyInterface ::~AxpyInterface () "resulting in a 'terminate', we instead immediately dump the " "call stack (if not in RELEASE mode) since the program will " "likely hang:" << std::endl; - std::cerr << os.str (); - DEBUG_ONLY ( DumpCallStack () ) + std::cerr << os.str(); + DEBUG_ONLY( DumpCallStack() ) } else - Detach (); + Detach(); } } template -void AxpyInterface ::Attach ( AxpyType type, DistMatrix & Z ) +void AxpyInterface ::Attach( AxpyType type, DistMatrix & Z ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Attach" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Attach" ) ) - if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError ( "Must detach before reattaching." ); + if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + LogicError( "Must detach before reattaching." 
); - const Grid& g = Z.Grid (); + const Grid& g = Z.Grid(); - if ( type == LOCAL_TO_GLOBAL ) + if( type == LOCAL_TO_GLOBAL ) { attachedForLocalToGlobal_ = true; localToGlobalMat_ = &Z; @@ -398,91 +400,97 @@ void AxpyInterface ::Attach ( AxpyType type, DistMatrix & Z ) globalToLocalMat_ = &Z; } - const Int p = Z.Grid ().Size (); + const Int p = Z.Grid().Size(); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else - sentEomTo_.resize ( p, false ); - haveEomFrom_.resize ( p, false ); - eomSendRequests_.resize ( p ); + sentEomTo_.resize( p, false ); + haveEomFrom_.resize( p, false ); + eomSendRequests_.resize( p ); #endif + // request objects - sendingRequest_.resize ( p ); - sendingData_.resize ( p ); - sendingReply_.resize ( p ); + sendingRequest_.resize( p ); + sendingData_.resize( p ); + sendingReply_.resize( p ); + // ready-to-send - requestSendRequests_.resize ( p ); - replySendRequests_.resize ( p ); - dataSendRequests_.resize ( p ); + requestSendRequests_.resize( p ); + replySendRequests_.resize( p ); + dataSendRequests_.resize( p ); + // data - dataVectors_.resize ( p ); - requestVectors_.resize ( p ); - replyVectors_.resize ( p ); + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); } template -void AxpyInterface ::Attach ( AxpyType type, - const DistMatrix & X ) +void AxpyInterface ::Attach( AxpyType type, + const DistMatrix & X ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Attach" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Attach" ) ) - if ( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError ( "Must detach before reattaching." ); + if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) + LogicError( "Must detach before reattaching." ); - if ( type == LOCAL_TO_GLOBAL ) - LogicError ( "Cannot update a constant matrix" ); + if( type == LOCAL_TO_GLOBAL ) + LogicError( "Cannot update a constant matrix" ); else { attachedForGlobalToLocal_ = true; globalToLocalMat_ = &X; } - const Int p = X.Grid ().Size (); + const Int p = X.Grid().Size(); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else // eom - sentEomTo_.resize ( p, false ); - haveEomFrom_.resize ( p, false ); - eomSendRequests_.resize ( p ); + sentEomTo_.resize( p, false ); + haveEomFrom_.resize( p, false ); + eomSendRequests_.resize( p ); #endif + // ready-to-send - sendingRequest_.resize ( p ); - sendingData_.resize ( p ); - sendingReply_.resize ( p ); + sendingRequest_.resize( p ); + sendingData_.resize( p ); + sendingReply_.resize( p ); + // ready-to-send - requestSendRequests_.resize ( p ); - replySendRequests_.resize ( p ); - dataSendRequests_.resize ( p ); + requestSendRequests_.resize( p ); + replySendRequests_.resize( p ); + dataSendRequests_.resize( p ); + // data - dataVectors_.resize ( p ); - replyVectors_.resize ( p ); - requestVectors_.resize ( p ); + dataVectors_.resize( p ); + replyVectors_.resize( p ); + requestVectors_.resize( p ); } template -void AxpyInterface ::Axpy ( T alpha, Matrix & Z, Int i, Int j ) +void AxpyInterface ::Axpy( T alpha, Matrix & Z, Int i, Int j ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Axpy" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Axpy" ) ) - if ( attachedForLocalToGlobal_ ) - AxpyLocalToGlobal ( alpha, Z, i, j ); - else if ( attachedForGlobalToLocal_ ) - AxpyGlobalToLocal ( alpha, Z, i, j ); + if( attachedForLocalToGlobal_ ) + AxpyLocalToGlobal( alpha, Z, i, j ); + else if( attachedForGlobalToLocal_ ) + AxpyGlobalToLocal( alpha, Z, i, j ); else - LogicError ( 
"Cannot axpy before attaching." ); + LogicError( "Cannot axpy before attaching." ); } template -void AxpyInterface ::Axpy ( T alpha, const Matrix & Z, Int i, - Int j ) +void AxpyInterface ::Axpy( T alpha, const Matrix & Z, Int i, + Int j ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Axpy" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Axpy" ) ) - if ( attachedForLocalToGlobal_ ) - AxpyLocalToGlobal ( alpha, Z, i, j ); - else if ( attachedForGlobalToLocal_ ) - LogicError ( "Cannot update a constant matrix." ); + if( attachedForLocalToGlobal_ ) + AxpyLocalToGlobal( alpha, Z, i, j ); + else if( attachedForGlobalToLocal_ ) + LogicError( "Cannot update a constant matrix." ); else - LogicError ( "Cannot axpy before attaching." ); + LogicError( "Cannot axpy before attaching." ); } // Update Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width @@ -490,173 +498,179 @@ template void AxpyInterface ::AxpyLocalToGlobal ( T alpha, const Matrix & X, Int i, Int j ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::AxpyLocalToGlobal" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyLocalToGlobal" ) ) DistMatrix & Y = *localToGlobalMat_; - if ( i < 0 || j < 0 ) - LogicError ( "Submatrix offsets must be non-negative" ); - - if ( i + X.Height () > Y.Height () || j + X.Width () > Y.Width () ) - LogicError ( "Submatrix out of bounds of global matrix" ); - - const Grid& g = Y.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); - const Int myProcessRow = g.Row (); - const Int myProcessCol = g.Col (); - const Int colAlign = ( Y.ColAlign () + i ) % r; - const Int rowAlign = ( Y.RowAlign () + j ) % c; - const Int height = X.Height (); - const Int width = X.Width (); + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( i + X.Height() > Y.Height() || j + X.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int height = X.Height(); + const Int width = X.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - for ( Int step = 0; step < p; ++step ) + for( Int step = 0; step < p; ++step ) { - const Int colShift = Shift ( receivingRow, colAlign, r ); - const Int rowShift = Shift ( receivingCol, rowAlign, c ); - const Int localHeight = Length ( height, colShift, r ); - const Int localWidth = Length ( width, rowShift, c ); + const Int colShift = Shift( receivingRow, colAlign, r ); + const Int rowShift = Shift( receivingCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if ( numEntries != 0 ) + if( numEntries != 0 ) { const Int destination = receivingRow + r * receivingCol; const Int bufferSize = - 4 * sizeof ( Int ) + ( numEntries + 1 ) * sizeof ( T ); + 4 * sizeof( Int ) + ( numEntries + 1 ) * sizeof( T ); const Int index = - ReadyForSend ( bufferSize, dataVectors_[destination], - dataSendRequests_[destination], - sendingData_[destination] ); + ReadyForSend( bufferSize, dataVectors_[destination], + dataSendRequests_[destination], + sendingData_[destination] ); - DEBUG_ONLY ( if - ( Int ( dataVectors_[destination][index].size () ) != - 
bufferSize ) LogicError ( "Error in ReadyForSend" ); ) - // Pack the header - byte* sendBuffer = dataVectors_[destination][index].data (); + DEBUG_ONLY( if + ( Int( dataVectors_[destination][index].size() ) != + bufferSize ) LogicError( "Error in ReadyForSend" ); ) + + // Pack the header + byte* sendBuffer = dataVectors_[destination][index].data(); byte* head = sendBuffer; *reinterpret_cast ( head ) = i; - head += sizeof ( Int ); + head += sizeof( Int ); *reinterpret_cast ( head ) = j; - head += sizeof ( Int ); + head += sizeof( Int ); *reinterpret_cast ( head ) = height; - head += sizeof ( Int ); + head += sizeof( Int ); *reinterpret_cast ( head ) = width; - head += sizeof ( Int ); + head += sizeof( Int ); *reinterpret_cast ( head ) = alpha; - head += sizeof ( T ); - // Pack the payload - T* sendData = reinterpret_cast ( head ); - const T* XBuffer = X.LockedBuffer (); - const Int XLDim = X.LDim (); - - for ( Int t = 0; t < localWidth; ++t ) + head += sizeof( T ); + + // Pack the payload + T* sendData = reinterpret_cast ( head ); + const T* XBuffer = X.LockedBuffer(); + const Int XLDim = X.LDim(); + + for( Int t = 0; t < localWidth; ++t ) { T* thisSendCol = &sendData[t * localHeight]; const T* thisXCol = &XBuffer[( rowShift + t * c ) * XLDim]; - for ( Int s = 0; s < localHeight; ++s ) + for( Int s = 0; s < localHeight; ++s ) thisSendCol[s] = thisXCol[colShift + s * r]; } // Fire off the non-blocking send mpi::TaggedISSend - ( sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm (), + ( sendBuffer, bufferSize, destination, DATA_TAG, g.VCComm(), dataSendRequests_[destination][index] ); } receivingRow = ( receivingRow + 1 ) % r; - if ( receivingRow == 0 ) + if( receivingRow == 0 ) receivingCol = ( receivingCol + 1 ) % c; } } // Update Y += alpha X(i:i+height-1,j:j+width-1), where X is the dist-matrix template -void AxpyInterface ::AxpyGlobalToLocal ( T alpha, Matrix & Y, +void AxpyInterface ::AxpyGlobalToLocal( T alpha, Matrix & Y, Int i, Int j ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::AxpyGlobalToLocal" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyGlobalToLocal" ) ) const DistMatrix & X = *globalToLocalMat_; - const Int height = Y.Height (); - const Int width = Y.Width (); + const Int height = Y.Height(); + const Int width = Y.Width(); - if ( i + height > X.Height () || j + width > X.Width () ) - LogicError ( "Invalid AxpyGlobalToLocal submatrix" ); + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid AxpyGlobalToLocal submatrix" ); - const Grid& g = X.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int p = g.Size (); + const Grid& g = X.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); // Send out the requests to all processes in the grid - for ( Int rank = 0; rank < p; ++rank ) + for( Int rank = 0; rank < p; ++rank ) { - const Int bufferSize = 4 * sizeof ( Int ); - const Int index = ReadyForSend ( bufferSize, requestVectors_[rank], - requestSendRequests_[rank], - sendingRequest_[rank] ); + const Int bufferSize = 4 * sizeof( Int ); + const Int index = ReadyForSend( bufferSize, requestVectors_[rank], + requestSendRequests_[rank], + sendingRequest_[rank] ); // Copy the request header into the send buffer - byte* sendBuffer = requestVectors_[rank][index].data (); + byte* sendBuffer = requestVectors_[rank][index].data(); byte* head = sendBuffer; *reinterpret_cast ( head ) = i; - head += sizeof ( Int ); + head += sizeof( Int ); *reinterpret_cast ( head ) = j; - head += sizeof 
( Int );
+
+        // Begin the non-blocking send
         mpi::TaggedISSend
-        ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm (),
+        ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm(),
           requestSendRequests_[rank][index] );
     }
 
     // Receive all of the replies
     Int numReplies = 0;
-
-    while ( numReplies < p )
+
+    while( numReplies < p )
     {
-        HandleGlobalToLocalRequest ();
+        HandleGlobalToLocalRequest();
         mpi::Status status;
 
-        if ( mpi::IProbe
-             ( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm (), status ) )
+        if( mpi::IProbe
+            ( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm(), status ) )
         {
             const Int source = status.MPI_SOURCE;
             // Ensure that we have a recv buffer
             const Int count = mpi::GetCount ( status );
-            recvVector_.resize ( count );
-            byte* recvBuffer = recvVector_.data ();
-            // Receive the data
+            recvVector_.resize( count );
+            byte* recvBuffer = recvVector_.data();
+
+            // Receive the data
             mpi::TaggedRecv
-            ( recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm () );
-            // Unpack the reply header
+            ( recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm() );
+
+            // Unpack the reply header
             const byte* head = recvBuffer;
             const Int row = *reinterpret_cast<const Int*> ( head );
-            head += sizeof ( Int );
+            head += sizeof( Int );
             const Int col = *reinterpret_cast<const Int*> ( head );
-            head += sizeof ( Int );
+            head += sizeof( Int );
             const T* recvData = reinterpret_cast<const T*> ( head );
-            // Compute the local heights and offsets
-            const Int colAlign = ( X.ColAlign () + i ) % r;
-            const Int rowAlign = ( X.RowAlign () + j ) % c;
-            const Int colShift = Shift ( row, colAlign, r );
-            const Int rowShift = Shift ( col, rowAlign, c );
-            const Int localHeight = Length ( height, colShift, r );
-            const Int localWidth = Length ( width, rowShift, c );
+
+            // Compute the local heights and offsets
+            const Int colAlign = ( X.ColAlign() + i ) % r;
+            const Int rowAlign = ( X.RowAlign() + j ) % c;
+            const Int colShift = Shift( row, colAlign, r );
+            const Int rowShift = Shift( col, rowAlign, c );
+            const Int localHeight = Length( height, colShift, r );
+            const Int localWidth = Length( width, rowShift, c );
 
             // Unpack the local matrix
-            for ( Int t = 0; t < localWidth; ++t )
+            for( Int t = 0; t < localWidth; ++t )
             {
-                T* YCol = Y.Buffer ( 0, rowShift + t * c );
+                T* YCol = Y.Buffer( 0, rowShift + t * c );
                 const T* XCol = &recvData[t * localHeight];
 
-                for ( Int s = 0; s < localHeight; ++s )
+                for( Int s = 0; s < localHeight; ++s )
                     YCol[colShift + s * r] += alpha * XCol[s];
             }
 
             ++numReplies;
         }
     }
 }
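The unpack loop above leans entirely on the Shift/Length helpers to map received columns back onto the cyclic distribution. Assuming the standard definitions used throughout Elemental, they reduce to the following, with a small worked example in the comments:

    // First global index in a 1-d cyclic distribution of stride 'stride'
    // owned by 'rank', given the distribution alignment 'align'.
    static int Shift( int rank, int align, int stride )
    { return ( rank + stride - align ) % stride; }

    // Number of global entries in [0,n) landing on a process whose first
    // entry is 'shift' and which owns every 'stride'-th entry afterwards.
    static int Length( int n, int shift, int stride )
    { return ( n > shift ? ( n - shift - 1 ) / stride + 1 : 0 ); }

    // Example: n = 10 entries, stride r = 4, alignment 0:
    //   rank 0 owns entries 0,4,8  ->  Length(10, 0, 4) == 3
    //   rank 3 owns entries 3,7    ->  Length(10, 3, 4) == 2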
- if ( requestStatuses[i] ) - { - const bool finished = mpi::Test ( requests[i] ); - requestStatuses[i] = !finished; + // If this request is still running, test to see if it finished. + if( requestStatuses[i] ) + { + const bool finished = mpi::Test( requests[i] ); + requestStatuses[i] = !finished; } - if ( !requestStatuses[i] ) + if( !requestStatuses[i] ) { - requestStatuses[i] = true; - sendVectors[i].resize ( sendSize ); + requestStatuses[i] = true; + sendVectors[i].resize( sendSize ); return i; } } - sendVectors.resize ( numCreated + 1 ); - sendVectors[numCreated].resize ( sendSize ); - requests.push_back ( mpi::REQUEST_NULL ); - requestStatuses.push_back ( true ); + sendVectors.resize( numCreated + 1 ); + sendVectors[numCreated].resize( sendSize ); + requests.push_back( mpi::REQUEST_NULL ); + requestStatuses.push_back( true ); return numCreated; } #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) -template bool AxpyInterface ::ReturnRequestStatuses () +template bool AxpyInterface ::ReturnRequestStatuses() { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::ReturnRequestStatuses" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::ReturnRequestStatuses" ) ) const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid () : - globalToLocalMat_->Grid () ); - const Int p = g.Size (); + localToGlobalMat_->Grid() : + globalToLocalMat_->Grid() ); + const Int p = g.Size(); - for ( Int i = 0; i < p; ++i ) + for( Int i = 0; i < p; ++i ) { - const Int numDataSendRequests = dataSendRequests_[i].size (); + const Int numDataSendRequests = dataSendRequests_[i].size(); - for ( Int j = 0; j < numDataSendRequests; ++j ) + for( Int j = 0; j < numDataSendRequests; ++j ) { - if ( sendingData_[i][j] ) - sendingData_[i][j] = !mpi::Test ( dataSendRequests_[i][j] ); + if( sendingData_[i][j] ) + sendingData_[i][j] = !mpi::Test( dataSendRequests_[i][j] ); - if ( sendingData_[i][j] ) + if( sendingData_[i][j] ) return false; } - const Int numRequestSendRequests = requestSendRequests_[i].size (); + const Int numRequestSendRequests = requestSendRequests_[i].size(); - for ( Int j = 0; j < numRequestSendRequests; ++j ) + for( Int j = 0; j < numRequestSendRequests; ++j ) { - if ( sendingRequest_[i][j] ) - sendingRequest_[i][j] = !mpi::Test ( requestSendRequests_[i][j] ); + if( sendingRequest_[i][j] ) + sendingRequest_[i][j] = !mpi::Test( requestSendRequests_[i][j] ); - if ( sendingRequest_[i][j] ) + if( sendingRequest_[i][j] ) return false; } - const Int numReplySendRequests = replySendRequests_[i].size (); + const Int numReplySendRequests = replySendRequests_[i].size(); - for ( Int j = 0; j < numReplySendRequests; ++j ) + for( Int j = 0; j < numReplySendRequests; ++j ) { - if ( sendingReply_[i][j] ) - sendingReply_[i][j] = !mpi::Test ( replySendRequests_[i][j] ); + if( sendingReply_[i][j] ) + sendingReply_[i][j] = !mpi::Test( replySendRequests_[i][j] ); - if ( sendingReply_[i][j] ) + if( sendingReply_[i][j] ) return false; } } @@ -751,47 +765,47 @@ template bool AxpyInterface ::ReturnRequestStatuses () return true; } #else -template void AxpyInterface ::UpdateRequestStatuses () +template void AxpyInterface ::UpdateRequestStatuses() { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::UpdateRequestStatuses" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::UpdateRequestStatuses" ) ) const Grid& g = ( attachedForLocalToGlobal_ ? 
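
ReturnRequestStatuses above is the local-completion test that gates the nonblocking barrier in Detach: it bails out false at the first Issend still in flight. A raw-MPI sketch of the same predicate (hypothetical helper, not library code):

    #include <mpi.h>
    #include <vector>

    // True iff every outstanding nonblocking send has completed locally.
    // MPI_Test frees a completed request and resets it to MPI_REQUEST_NULL.
    bool AllSendsFinished( std::vector<MPI_Request>& reqs )
    {
        for( MPI_Request& r : reqs )
        {
            if( r == MPI_REQUEST_NULL )
                continue;
            int done = 0;
            MPI_Test( &r, &done, MPI_STATUS_IGNORE );
            if( !done )
                return false;
        }
        return true;
    }

Because the sends are synchronous (Issend), local completion also certifies that the receiver has matched the message, which is what makes the consensus in Detach sound.
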
- localToGlobalMat_->Grid () : - globalToLocalMat_->Grid () ); - const Int p = g.Size (); + localToGlobalMat_->Grid() : + globalToLocalMat_->Grid() ); + const Int p = g.Size(); - for ( Int i = 0; i < p; ++i ) + for( Int i = 0; i < p; ++i ) { - const Int numDataSendRequests = dataSendRequests_[i].size (); + const Int numDataSendRequests = dataSendRequests_[i].size(); - for ( Int j = 0; j < numDataSendRequests; ++j ) - if ( sendingData_[i][j] ) - sendingData_[i][j] = !mpi::Test ( dataSendRequests_[i][j] ); + for( Int j = 0; j < numDataSendRequests; ++j ) + if( sendingData_[i][j] ) + sendingData_[i][j] = !mpi::Test( dataSendRequests_[i][j] ); - const Int numRequestSendRequests = requestSendRequests_[i].size (); + const Int numRequestSendRequests = requestSendRequests_[i].size(); - for ( Int j = 0; j < numRequestSendRequests; ++j ) - if ( sendingRequest_[i][j] ) - sendingRequest_[i][j] = !mpi::Test ( requestSendRequests_[i][j] ); + for( Int j = 0; j < numRequestSendRequests; ++j ) + if( sendingRequest_[i][j] ) + sendingRequest_[i][j] = !mpi::Test( requestSendRequests_[i][j] ); - const Int numReplySendRequests = replySendRequests_[i].size (); + const Int numReplySendRequests = replySendRequests_[i].size(); - for ( Int j = 0; j < numReplySendRequests; ++j ) - if ( sendingReply_[i][j] ) - sendingReply_[i][j] = !mpi::Test ( replySendRequests_[i][j] ); + for( Int j = 0; j < numReplySendRequests; ++j ) + if( sendingReply_[i][j] ) + sendingReply_[i][j] = !mpi::Test( replySendRequests_[i][j] ); } } #endif //EL_USE_IBARRIER_FOR_AXPY -template void AxpyInterface ::Detach () +template void AxpyInterface ::Detach() { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::Detach" ) ) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Detach" ) ) - if ( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) - LogicError ( "Must attach before detaching." ); + if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) + LogicError( "Must attach before detaching." ); const Grid& g = ( attachedForLocalToGlobal_ ? 
- localToGlobalMat_->Grid () : globalToLocalMat_-> - Grid () ); + localToGlobalMat_->Grid() : globalToLocalMat_-> + Grid() ); // nonblocking consensus #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) bool DONE = false; @@ -809,39 +823,44 @@ template void AxpyInterface ::Detach () HandleGlobalToLocalRequest(); #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) - if ( nb_bar_active ) - DONE = mpi::Test ( nb_bar_request ); + + if( nb_bar_active ) + DONE = mpi::Test( nb_bar_request ); else { - if ( ReturnRequestStatuses() ) + if( ReturnRequestStatuses() ) { // all ssends are complete, start nonblocking barrier - mpi::IBarrier ( g.VCComm (), nb_bar_request ); + mpi::IBarrier( g.VCComm(), nb_bar_request ); nb_bar_active = true; } } - #else HandleEoms(); #endif } - mpi::Barrier ( g.VCComm () ); + mpi::Barrier( g.VCComm() ); + attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; - recvVector_.clear(); + #if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) #else sentEomTo_.clear(); haveEomFrom_.clear(); eomSendRequests_.clear(); #endif + sendingData_.clear(); sendingRequest_.clear(); sendingReply_.clear(); + dataVectors_.clear(); requestVectors_.clear(); replyVectors_.clear(); + recvVector_.clear(); + dataSendRequests_.clear(); requestSendRequests_.clear(); replySendRequests_.clear(); diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp index ba27ec920d..5c0f8452b9 100644 --- a/src/core/AxpyInterface2.0.cpp +++ b/src/core/AxpyInterface2.0.cpp @@ -32,10 +32,13 @@ template AxpyInterface2::AxpyInterface2( const DistMatrix& Z ) { DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::AxpyInterface2" ) ) + attached_ = false; detached_ = true; + toBeAttachedForGet_ = false; toBeAttachedForPut_ = false; + GlobalArrayPut_ = 0; GlobalArrayGet_ = 0; } @@ -60,14 +63,14 @@ AxpyInterface2::~AxpyInterface2() } template -Int AxpyInterface2::NextIndexData ( +Int AxpyInterface2::NextIndexData( Int target, Int dataSize, const void* base_address, Int* mindex ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface2::NextIndexData" ) ) - assert ( base_address != NULL ); + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::NextIndexData" ) ) + assert( base_address != NULL ); Int matrixIndex = 0; const Grid& g = ( toBeAttachedForPut_ ? 
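
The Detach loop just rewritten is the nonblocking-consensus (NBX) pattern from the Hoefler et al. DSDE paper cited in the commit message: keep draining incoming traffic, join an MPI_Ibarrier once all local Issends have completed, and terminate when the barrier itself completes. A condensed raw-MPI-3 sketch of the shape of that loop (PollIncoming is a hypothetical stand-in for the Handle* calls):

    #include <mpi.h>
    #include <vector>

    void NonblockingConsensus( MPI_Comm comm, std::vector<MPI_Request>& sends )
    {
        MPI_Request barrier = MPI_REQUEST_NULL;
        bool barrierActive = false, done = false;
        while( !done )
        {
            // PollIncoming( comm ); // IProbe + Recv, as the Handle* calls do
            if( !barrierActive )
            {
                int allSent = 0;
                MPI_Testall( (int)sends.size(), sends.data(),
                             &allSent, MPI_STATUSES_IGNORE );
                if( allSent ) // every local Issend has been matched somewhere
                {
                    MPI_Ibarrier( comm, &barrier );
                    barrierActive = true;
                }
            }
            else
            {
                int flag = 0;
                MPI_Test( &barrier, &flag, MPI_STATUS_IGNORE );
                done = ( flag != 0 ); // all ranks joined: global quiescence
            }
        }
    }
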
GlobalArrayPut_->Grid() : @@ -76,16 +79,16 @@ Int AxpyInterface2::NextIndexData ( const Int numMatrices = matrices_.size(); // search for matrix base - for ( Int m = 0; m < numMatrices; m++ ) + for( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) + if( matrices_[m].base_ == base_address ) { matrixIndex = m; break; } // uninitiated, first time - if ( matrices_[m].base_ == NULL ) + if( matrices_[m].base_ == NULL ) { matrices_[m].base_ = base_address; matrixIndex = m; @@ -96,7 +99,7 @@ Int AxpyInterface2::NextIndexData ( } // need to create new object - if ( matrixIndex == numMatrices ) + if( matrixIndex == numMatrices ) { struct matrix_params_ mp; mp.data_.resize( p ); @@ -111,47 +114,48 @@ Int AxpyInterface2::NextIndexData ( // go through the request, data, // status objects - const Int numCreated = matrices_[matrixIndex].data_[target].size (); + const Int numCreated = matrices_[matrixIndex].data_[target].size(); - DEBUG_ONLY ( if ( numCreated != Int ( matrices_[matrixIndex].requests_[target].size () ) - || numCreated != Int ( matrices_[matrixIndex].statuses_[target].size () ) ) - LogicError ( "size mismatch" ); ) - for ( Int i = 0; i < numCreated; ++i ) - { - // If this request is still running, - // test to see if it finished. - if ( matrices_[matrixIndex].statuses_[target][i] ) - { - const bool finished = mpi::Test ( matrices_[matrixIndex].requests_[target][i] ); - matrices_[matrixIndex].statuses_[target][i] = !finished; - } + DEBUG_ONLY( if( numCreated != Int( matrices_[matrixIndex].requests_[target].size() ) + || numCreated != Int( matrices_[matrixIndex].statuses_[target].size() ) ) + LogicError( "size mismatch" ); ) - if ( !matrices_[matrixIndex].statuses_[target][i] ) - { - matrices_[matrixIndex].statuses_[target][i] = true; - matrices_[matrixIndex].data_[target][i].resize ( dataSize ); - *mindex = matrixIndex; - return i; - } + for( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. + if( matrices_[matrixIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test( matrices_[matrixIndex].requests_[target][i] ); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } + + if( !matrices_[matrixIndex].statuses_[target][i] ) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize( dataSize ); + *mindex = matrixIndex; + return i; } + } - matrices_[matrixIndex].data_[target].resize ( numCreated + 1 ); - matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize ); - matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); - matrices_[matrixIndex].statuses_[target].push_back ( true ); + matrices_[matrixIndex].data_[target].resize( numCreated + 1 ); + matrices_[matrixIndex].data_[target][numCreated].resize( dataSize ); + matrices_[matrixIndex].requests_[target].push_back( mpi::REQUEST_NULL ); + matrices_[matrixIndex].statuses_[target].push_back( true ); *mindex = matrixIndex; return numCreated; } template -Int AxpyInterface2::NextIndexCoord ( +Int AxpyInterface2::NextIndexCoord( Int i, Int j, Int target, const void* base_address, Int* cindex ) { - DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface2::NextIndexCoord" ) ) - assert ( base_address != NULL ); + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::NextIndexCoord" ) ) + assert( base_address != NULL ); Int coordIndex = 0; const Grid& g = ( toBeAttachedForPut_ ? 
GlobalArrayPut_->Grid() : @@ -160,15 +164,15 @@ Int AxpyInterface2::NextIndexCoord ( const Int numCoords = coords_.size(); // search for matrix base - for ( Int m = 0; m < numCoords; m++ ) + for( Int m = 0; m < numCoords; m++ ) { - if ( coords_[m].base_ == base_address ) + if( coords_[m].base_ == base_address ) { coordIndex = m; break; } - if ( coords_[m].base_ == NULL ) + if( coords_[m].base_ == NULL ) { coords_[m].base_ = base_address; coordIndex = m; @@ -179,7 +183,7 @@ Int AxpyInterface2::NextIndexCoord ( } // need to create new object - if ( coordIndex == numCoords ) + if( coordIndex == numCoords ) { struct coord_params_ cp; cp.coord_.resize( p ); @@ -194,36 +198,37 @@ Int AxpyInterface2::NextIndexCoord ( // go through the request, data, // status objects - const Int numCreated = coords_[coordIndex].coord_[target].size (); + const Int numCreated = coords_[coordIndex].coord_[target].size(); - DEBUG_ONLY ( if ( numCreated != Int ( coords_[coordIndex].requests_[target].size () ) - || numCreated != Int ( matrices_[coordIndex].statuses_[target].size () ) ) - LogicError ( "size mismatch" ); ) - for ( Int i = 0; i < numCreated; ++i ) - { - // If this request is still running, - // test to see if it finished. - if ( coords_[coordIndex].statuses_[target][i] ) - { - const bool finished = mpi::Test ( coords_[coordIndex].requests_[target][i] ); - coords_[coordIndex].statuses_[target][i] = !finished; - } + DEBUG_ONLY( if( numCreated != Int( coords_[coordIndex].requests_[target].size() ) + || numCreated != Int( matrices_[coordIndex].statuses_[target].size() ) ) + LogicError( "size mismatch" ); ) + + for( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. + if( coords_[coordIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test( coords_[coordIndex].requests_[target][i] ); + coords_[coordIndex].statuses_[target][i] = !finished; + } - if ( !coords_[coordIndex].statuses_[target][i] ) - { - coords_[coordIndex].statuses_[target][i] = true; - coords_[coordIndex].coord_[target][i][0] = i; - coords_[coordIndex].coord_[target][i][1] = j; - *cindex = coordIndex; - return i; - } + if( !coords_[coordIndex].statuses_[target][i] ) + { + coords_[coordIndex].statuses_[target][i] = true; + coords_[coordIndex].coord_[target][i][0] = i; + coords_[coordIndex].coord_[target][i][1] = j; + *cindex = coordIndex; + return i; } + } - coords_[coordIndex].coord_[target].resize ( numCreated + 1 ); + coords_[coordIndex].coord_[target].resize( numCreated + 1 ); coords_[coordIndex].coord_[target][numCreated][0] = i; coords_[coordIndex].coord_[target][numCreated][1] = j; - coords_[coordIndex].requests_[target].push_back ( mpi::REQUEST_NULL ); - coords_[coordIndex].statuses_[target].push_back ( true ); + coords_[coordIndex].requests_[target].push_back( mpi::REQUEST_NULL ); + coords_[coordIndex].statuses_[target].push_back( true ); *cindex = coordIndex; return numCreated; } @@ -235,7 +240,7 @@ void AxpyInterface2::Attach( DistMatrix& Z ) // attached_ will be only set in Attach // and only unset in Detach - if ( !attached_ && detached_ ) + if( !attached_ && detached_ ) { attached_ = true; detached_ = false; @@ -244,7 +249,7 @@ void AxpyInterface2::Attach( DistMatrix& Z ) LogicError( "Must detach before reattaching." 
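
Two apparent slips in NextIndexCoord above survive the reformatting and may be worth a follow-up: the DEBUG_ONLY size check consults matrices_[coordIndex].statuses_ where coords_[coordIndex].statuses_ is presumably meant, and the loop counter Int i shadows the coordinate parameter i, so coord_[target][i][0] = i records the slot index rather than the row offset. A sketch of the slot-reuse loop under that reading (same member names as above, counter renamed to s):

    for( Int s = 0; s < numCreated; ++s )
    {
        // Refresh the status of a still-pending request
        if( coords_[coordIndex].statuses_[target][s] )
        {
            const bool finished =
                mpi::Test( coords_[coordIndex].requests_[target][s] );
            coords_[coordIndex].statuses_[target][s] = !finished;
        }
        // Reuse the first free slot, storing the *parameter* coordinates
        if( !coords_[coordIndex].statuses_[target][s] )
        {
            coords_[coordIndex].statuses_[target][s] = true;
            coords_[coordIndex].coord_[target][s][0] = i;
            coords_[coordIndex].coord_[target][s][1] = j;
            *cindex = coordIndex;
            return s;
        }
    }
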
); const Grid& g = Z.Grid(); - const Int p = g.Size (); + const Int p = g.Size(); // the matrix base_ is not known until // an update operation (put/get/acc) @@ -258,10 +263,10 @@ void AxpyInterface2::Attach( DistMatrix& Z ) GlobalArrayGet_ = &Z; toBeAttachedForGet_ = true; - if ( dataVectors_.empty() ) + if( dataVectors_.empty() ) dataVectors_.resize( p ); - if ( matrices_.empty() ) + if( matrices_.empty() ) { struct matrix_params_ mp; mp.data_.resize( p ); @@ -273,7 +278,7 @@ void AxpyInterface2::Attach( DistMatrix& Z ) matrices_.push_back( mp ); } - if ( coords_.empty() ) + if( coords_.empty() ) { struct coord_params_ cp; cp.coord_.resize( p ); @@ -286,7 +291,7 @@ void AxpyInterface2::Attach( DistMatrix& Z ) } } - mpi::Barrier ( g.VCComm() ); + mpi::Barrier( g.VCComm() ); } template @@ -296,7 +301,7 @@ void AxpyInterface2::Attach( const DistMatrix& Z ) // attached_ will be only set in Attach // and only unset in Detach - if ( !attached_ && detached_ ) + if( !attached_ && detached_ ) { attached_ = true; detached_ = false; @@ -305,7 +310,7 @@ void AxpyInterface2::Attach( const DistMatrix& Z ) LogicError( "Must detach before reattaching." ); const Grid& g = Z.Grid(); - const Int p = g.Size (); + const Int p = g.Size(); // the matrix base_ is not known until // an update operation (put/get/acc) @@ -319,7 +324,7 @@ void AxpyInterface2::Attach( const DistMatrix& Z ) GlobalArrayGet_ = &Z; toBeAttachedForGet_ = true; - if ( matrices_.empty() ) + if( matrices_.empty() ) { struct matrix_params_ mp; mp.data_.resize( p ); @@ -331,7 +336,7 @@ void AxpyInterface2::Attach( const DistMatrix& Z ) matrices_.push_back( mp ); } - if ( coords_.empty() ) + if( coords_.empty() ) { struct coord_params_ cp; cp.coord_.resize( p ); @@ -344,7 +349,7 @@ void AxpyInterface2::Attach( const DistMatrix& Z ) } } - mpi::Barrier ( g.VCComm() ); + mpi::Barrier( g.VCComm() ); } // end-to-end blocking put/acc routines @@ -356,7 +361,7 @@ void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -392,20 +397,20 @@ void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if( numEntries > 0 ) + if( numEntries > 0 ) { const Int destination = receivingRow + r*receivingCol; // data const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Eput( const Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_PUT_TAG, g.VCComm(), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // coordinates const Int cindex = - 
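
Eput pairs every data send with the 3-Int coordinate message { i, j, numEntries } assembled at this step; the receive side, shown next, matches the pair by source rank and tag (MPI's per-source, per-tag ordering keeps the streams aligned). A raw-MPI sketch of the receiving half, with T fixed to double and the tag parameters left symbolic:

    #include <mpi.h>
    #include <vector>

    void ReceiveOneUpdate( MPI_Comm comm, int coordTag, int dataTag )
    {
        MPI_Status status;
        int coord[3]; // { i, j, numEntries }
        MPI_Recv( coord, 3, MPI_INT, MPI_ANY_SOURCE, coordTag, comm, &status );
        const int source = status.MPI_SOURCE;
        std::vector<double> data( coord[2] );
        MPI_Recv( data.data(), coord[2], MPI_DOUBLE, source, dataTag,
                  comm, MPI_STATUS_IGNORE );
        // ... scatter data into the local portion at global offset
        //     ( coord[0], coord[1] ) ...
    }
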
NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; // post receive for coordinates - mpi::TaggedISend ( coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -442,59 +447,57 @@ void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) } // poke - Test ( Z ); + Test( Z ); // data/coord receive std::vector recvVector_; - for ( Int step=0; step void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) -{ - Eput( const_cast&>( Z ), i, j ); -} +{ Eput( const_cast&>( Z ), i, j ); } // end to end blocking routines template @@ -505,7 +508,7 @@ void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -543,20 +546,21 @@ void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if( numEntries > 0 ) + if( numEntries > 0 ) { const Int destination = receivingRow + r*receivingCol; // data const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Eacc( const Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_ACC_TAG, g.VCComm(), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_ACC_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // coordinates const Int cindex = - NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; // post receive for coordinates - mpi::TaggedISend ( coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -593,59 +597,57 @@ void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) } // test for requests - Test ( Z ); + Test( Z ); - for ( Int step=0; step void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) -{ - Eacc( const_cast&>( Z ), i, j ); -} +{ Eacc( const_cast&>( Z ), i, j ); } template void AxpyInterface2::Get( 
Matrix& Z, Int i, Int j ) @@ -655,79 +657,81 @@ void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ( "Cannot perform this operation as matrix is not attached." ); + if( !toBeAttachedForGet_ ) + LogicError( "Cannot perform this operation as matrix is not attached." ); const DistMatrix& X = *GlobalArrayGet_; - const Int height = Z.Height (); - const Int width = Z.Width (); + const Int height = Z.Height(); + const Int width = Z.Width(); - if ( i + height > X.Height () || j + width > X.Width () ) - LogicError ( "Invalid submatrix for Iget" ); + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid submatrix for Iget" ); T* XBuffer = Z.Buffer(); const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); - const Grid& g = X.Grid (); - const Int p = g.Size (); - const Int r = g.Height (); - const Int c = g.Width (); + const Grid& g = X.Grid(); + const Int p = g.Size(); + const Int r = g.Height(); + const Int c = g.Width(); Int coord_index; std::vector recvVector_; // Send out the requests to all processes in the grid - for ( Int rank = 0; rank < p; ++rank ) + for( Int rank = 0; rank < p; ++rank ) { const Int cindex = - NextIndexCoord ( i, j, - rank, - Buffer, - &coord_index ); - Int* coord = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data () ); + NextIndexCoord( i, j, + rank, + Buffer, + &coord_index ); + Int* coord = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data() ); coord[0] = i; coord[1] = j; coord[2] = -1; - mpi::TaggedISend ( coord, 3, rank, - REQUEST_GET_TAG, g.VCComm (), - coords_[coord_index].requests_[rank][cindex] ); + mpi::TaggedISend( coord, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex] ); } // Receive all of the replies Int numReplies = 0; - while ( numReplies < p ) + while( numReplies < p ) { mpi::Status status; - HandleGlobalToLocalData ( Z ); + HandleGlobalToLocalData( Z ); - if ( mpi::IProbe - ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status ) ) + if( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm(), status ) ) { const Int source = status.MPI_SOURCE; // Ensure that we have a recv buffer const Int count = mpi::GetCount ( status ); - recvVector_.resize ( count ); - T* recvBuffer = recvVector_.data (); - // Receive the data + recvVector_.resize( count ); + T* recvBuffer = recvVector_.data(); + + // Receive the data mpi::TaggedRecv - ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm () ); - // Compute the local heights and offsets - const Int myRow = g.Row (); - const Int myCol = g.Col (); - const Int colAlign = ( X.ColAlign () + i ) % r; - const Int rowAlign = ( X.RowAlign () + j ) % c; - const Int colShift = Shift ( myRow, colAlign, r ); - const Int rowShift = Shift ( myCol, rowAlign, c ); - const Int localHeight = Length ( height, colShift, r ); - const Int localWidth = Length ( width, rowShift, c ); + ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm() ); + + // Compute the local heights and offsets + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); 
// Unpack the local matrix - for ( Int t = 0; t < localWidth; ++t ) + for( Int t = 0; t < localWidth; ++t ) { //T *YCol = X.Buffer (0, rowShift + t * c); - T* YCol = Z.Buffer ( 0, rowShift + t * c ); + T* YCol = Z.Buffer( 0, rowShift + t * c ); const T* XCol = &recvBuffer[t * localHeight]; - for ( Int s = 0; s < localHeight; ++s ) + for( Int s = 0; s < localHeight; ++s ) YCol[colShift + s * r] = XCol[s]; } @@ -746,7 +750,7 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -770,7 +774,7 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) const Int width = Z.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); + const Int YLDim = Y.LDim(); const T* XBuffer = Z.LockedBuffer(); const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); @@ -787,15 +791,15 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) { const Int destination = receivingRow + r*receivingCol; const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Iput( const Matrix& Z, Int i, Int j ) } // put request - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_PUT_TAG, g.VCComm (), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // send coordinates const Int cindex = - NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; // post receive for coordinates - mpi::TaggedISend ( coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -835,9 +839,7 @@ void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) template void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) -{ - Iput( const_cast&>( Z ), i, j ); -} +{ Iput( const_cast&>( Z ), i, j ); } template void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) @@ -847,38 +849,38 @@ void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) // a call to Attach with a non-const DistMatrix must set // toBeAttachedForGet_ also, if not then it is assumed that // the DistMatrix isn't attached - if ( !toBeAttachedForGet_ ) - LogicError ( "Cannot perform this operation as matrix is not attached." ); + if( !toBeAttachedForGet_ ) + LogicError( "Cannot perform this operation as matrix is not attached." 
); const DistMatrix& X = *GlobalArrayGet_; - const Int height = Z.Height (); - const Int width = Z.Width (); + const Int height = Z.Height(); + const Int width = Z.Width(); const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); Int coord_index; - if ( i + height > X.Height () || j + width > X.Width () ) - LogicError ( "Invalid submatrix for Iget" ); + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid submatrix for Iget" ); - const Grid& g = X.Grid (); - const Int p = g.Size (); + const Grid& g = X.Grid(); + const Int p = g.Size(); // Send out the requests to all processes in the grid - for ( Int rank = 0; rank < p; ++rank ) + for( Int rank = 0; rank < p; ++rank ) { // send coordinates const Int cindex = - NextIndexCoord ( i, j, - rank, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + rank, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = -1; // post receive for coordinates - mpi::TaggedISend ( coord_, 3, rank, - REQUEST_GET_TAG, g.VCComm(), - coords_[coord_index].requests_[rank][cindex] ); + mpi::TaggedISend( coord_, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex] ); } } @@ -892,7 +894,7 @@ void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -929,19 +931,19 @@ void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if( numEntries != 0 ) + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Iacc( const Matrix& Z, Int i, Int j ) } // acc request - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_ACC_TAG, g.VCComm (), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_ACC_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // send coordinates const Int cindex = - NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; - mpi::TaggedISend ( coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -980,9 +982,7 @@ void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) 
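
Stepping back from the individual hunks: Iput/Iacc/Iget only post nonblocking sends, and completion is deferred to Flush/Wait/Test. A hedged sketch of the intended calling sequence, assuming a DistMatrix<double> A already distributed over a grid, a local Matrix<double> B, and a Detach symmetric to the Attach shown earlier:

    AxpyInterface2<double> interface;
    interface.Attach( A );      // collective over A's grid
    interface.Iacc( B, i, j );  // post B into A at (i,j), nonblocking
    interface.Flush( B );       // drive progress until B's sends complete
    interface.Detach();         // collective teardown
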
template void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) -{ - Iacc( const_cast&>( Z ), i, j ); -} +{ Iacc( const_cast&>( Z ), i, j ); } // nonblocking, local completion template @@ -993,7 +993,7 @@ void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -1012,21 +1012,24 @@ void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); Int matrix_index, coord_index; + // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - const Int YLDim = Y.LDim (); + const Int YLDim = Y.LDim(); + // copy local matrix buffer const Int my_rank = g.VCRank(); - const Int numCreated = dataVectors_[my_rank].size (); - dataVectors_[my_rank].resize ( numCreated + 1 ); - dataVectors_[my_rank][numCreated].resize ( width * height ); + const Int numCreated = dataVectors_[my_rank].size(); + dataVectors_[my_rank].resize( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize( width * height ); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); - MemCopy ( ZBuffer, reinterpret_cast ( Buffer ), - height * width ); + MemCopy( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); T* XBuffer = reinterpret_cast ( ZBuffer ); for( Int step=0; step::Put( const Matrix& Z, Int i, Int j ) const Int destination = receivingRow + r*receivingCol; // data const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Put( const Matrix& Z, Int i, Int j ) thisSendCol[s] = thisXCol[colShift+s*r]; } - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_PUT_TAG, g.VCComm (), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_PUT_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // send coordinates const Int cindex = - NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; - mpi::TaggedISend ( coord_, 3, destination, - COORD_PUT_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -1089,9 +1092,7 @@ void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) template void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) -{ - Put( const_cast&>( Z ), i, j ); -} +{ 
Put( const_cast&>( Z ), i, j ); } // input buffer could be modified upon exit // from this function @@ -1103,7 +1104,7 @@ void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) if( i < 0 || j < 0 ) LogicError( "Submatrix offsets must be non-negative" ); - if ( !toBeAttachedForPut_ ) + if( !toBeAttachedForPut_ ) LogicError( "Global matrix cannot be updated" ); DistMatrix& Y = *GlobalArrayPut_; @@ -1122,21 +1123,24 @@ void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) const Int colAlign = ( Y.ColAlign() + i ) % r; const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int XLDim = Z.LDim(); + // local matrix width and height const Int height = Z.Height(); const Int width = Z.Width(); Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; const Int YLDim = Y.LDim(); + // copy local matrix buffer const Int my_rank = g.VCRank(); - const Int numCreated = dataVectors_[my_rank].size (); - dataVectors_[my_rank].resize ( numCreated + 1 ); - dataVectors_[my_rank][numCreated].resize ( width * height ); + const Int numCreated = dataVectors_[my_rank].size(); + dataVectors_[my_rank].resize( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize( width * height ); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); - MemCopy ( ZBuffer, reinterpret_cast ( Buffer ), - height * width ); + MemCopy( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); T* XBuffer = reinterpret_cast ( ZBuffer ); for( Int step=0; step::Acc( const Matrix& Z, Int i, Int j ) const Int localWidth = Length( width, rowShift, c ); const Int numEntries = localHeight * localWidth; - if( numEntries != 0 ) + if( numEntries != 0 ) { const Int destination = receivingRow + r*receivingCol; // data const Int dindex = - NextIndexData ( destination, - numEntries, - Buffer, - &matrix_index ); + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); - DEBUG_ONLY ( if - ( Int ( matrices_[matrix_index].data_[destination][dindex].size () ) != - numEntries ) LogicError ( "Error in NextIndexData" ); ) - T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data (); + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); for( Int t=0; t::Acc( const Matrix& Z, Int i, Int j ) } // acc request - mpi::TaggedISend ( sendBuffer, numEntries, destination, - DATA_ACC_TAG, g.VCComm (), - matrices_[matrix_index].requests_[destination][dindex] ); + mpi::TaggedISend( sendBuffer, numEntries, destination, + DATA_ACC_TAG, g.VCComm(), + matrices_[matrix_index].requests_[destination][dindex] ); // send coordinates const Int cindex = - NextIndexCoord ( i, j, - destination, - Buffer, - &coord_index ); + NextIndexCoord( i, j, + destination, + Buffer, + &coord_index ); Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[destination][cindex].data() ); coord_[0] = i; coord_[1] = j; coord_[2] = numEntries; - mpi::TaggedISend ( coord_, 3, destination, - COORD_ACC_TAG, g.VCComm(), - coords_[coord_index].requests_[destination][cindex] ); + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); } receivingRow = ( receivingRow + 1 ) % r; @@ -1200,9 +1204,7 @@ void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) template void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) -{ - Acc( 
const_cast&>( Z ), i, j ); -} +{ Acc( const_cast&>( Z ), i, j ); } // waitany implementation // cannot use mpi::Waitany @@ -1226,9 +1228,9 @@ void AxpyInterface2::WaitAny( const Matrix& Z ) const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for ( Int m = 0; m < numMatrices; m++ ) + for( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) + if( matrices_[m].base_ == base_address ) { matrixIndex = m; break; @@ -1238,9 +1240,9 @@ void AxpyInterface2::WaitAny( const Matrix& Z ) } // search for matrix base in coords - for ( Int c = 0; c < numCoords; c++ ) + for( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) + if( coords_[c].base_ == base_address ) { coordIndex = c; break; @@ -1250,23 +1252,23 @@ void AxpyInterface2::WaitAny( const Matrix& Z ) } // matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords ) + if( matrixIndex == numMatrices && + coordIndex == numCoords ) return; // data - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { - if ( !matrices_[matrixIndex].statuses_[rank][i] ) + if( !matrices_[matrixIndex].statuses_[rank][i] ) { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); matrices_[matrixIndex].statuses_[rank][i] = true; return; } @@ -1274,18 +1276,18 @@ void AxpyInterface2::WaitAny( const Matrix& Z ) } // coordinates - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + if( coords_[coordIndex].statuses_[rank].size() == 0 ) continue; - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { - if ( !coords_[coordIndex].statuses_[rank][i] ) + if( !coords_[coordIndex].statuses_[rank][i] ) { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); coords_[coordIndex].statuses_[rank][i] = true; return; } @@ -1295,9 +1297,7 @@ void AxpyInterface2::WaitAny( const Matrix& Z ) template void AxpyInterface2::WaitAny( Matrix& Z ) -{ - WaitAny( const_cast&>( Z ) ); -} +{ WaitAny( const_cast&>( Z ) ); } template void AxpyInterface2::Wait( const Matrix& Z ) @@ -1317,9 +1317,9 @@ void AxpyInterface2::Wait( const Matrix& Z ) const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for ( Int m = 0; m < numMatrices; m++ ) + for( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) + if( matrices_[m].base_ == base_address ) { matrixIndex = m; break; @@ -1329,9 +1329,9 @@ void AxpyInterface2::Wait( const Matrix& Z ) } // search for matrix base in coords - for ( Int c = 0; c < numCoords; c++ ) + for( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) + if( coords_[c].base_ == base_address ) { coordIndex = c; break; @@ -1341,36 +1341,36 @@ void AxpyInterface2::Wait( const Matrix& Z ) } // 
matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords ) + if( matrixIndex == numMatrices && + coordIndex == numCoords ) return; // data - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); matrices_[matrixIndex].statuses_[rank][i] = true; } } // coordinates - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + if( coords_[coordIndex].statuses_[rank].size() == 0 ) continue; - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); coords_[coordIndex].statuses_[rank][i] = true; } } @@ -1378,12 +1378,10 @@ void AxpyInterface2::Wait( const Matrix& Z ) template void AxpyInterface2::Wait( Matrix& Z ) -{ - Wait( const_cast&>( Z ) ); -} +{ Wait( const_cast&>( Z ) ); } template -void AxpyInterface2::Waitall () +void AxpyInterface2::Waitall() { DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Waitall" ) ) @@ -1399,30 +1397,30 @@ void AxpyInterface2::Waitall () const Int numCoords = coords_.size(); // data - for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) { - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { - mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] ); + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); matrices_[matrixIndex].statuses_[rank][i] = true; } } } // coordinates - for ( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) + for( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) { - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { - mpi::Wait ( coords_[coordIndex].requests_[rank][i] ); + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); coords_[coordIndex].statuses_[rank][i] = true; } } @@ -1447,9 +1445,9 @@ bool AxpyInterface2::Test( const Matrix& Z ) const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for ( Int m = 0; m < numMatrices; m++ ) + for( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) + if( matrices_[m].base_ == base_address ) { matrixIndex = m; break; @@ -1459,9 +1457,9 @@ bool AxpyInterface2::Test( 
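
The comment in the WaitAny hunk above notes that mpi::Waitany cannot be used directly, presumably because the requests live in many per-rank, per-matrix deques rather than one contiguous array; Wait and Waitall therefore walk the bookkeeping by hand. Over a flat request array, their raw-MPI analogues would simply be:

    #include <mpi.h>
    #include <vector>

    void WaitAllSketch( std::vector<MPI_Request>& reqs )
    { MPI_Waitall( (int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE ); }

    bool TestAllSketch( std::vector<MPI_Request>& reqs )
    {
        int flag = 0;
        MPI_Testall( (int)reqs.size(), reqs.data(), &flag,
                     MPI_STATUSES_IGNORE );
        return flag != 0; // true iff every request has completed
    }
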
const Matrix& Z ) } // search for matrix base in coords - for ( Int c = 0; c < numCoords; c++ ) + for( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) + if( coords_[c].base_ == base_address ) { coordIndex = c; break; @@ -1471,40 +1469,40 @@ bool AxpyInterface2::Test( const Matrix& Z ) } // matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords ) + if( matrixIndex == numMatrices && + coordIndex == numCoords ) return true; - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); - if ( matrices_[matrixIndex].statuses_[rank][i] ) + if( matrices_[matrixIndex].statuses_[rank][i] ) return false; } } - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + if( coords_[coordIndex].statuses_[rank].size() == 0 ) continue; - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); - if ( coords_[coordIndex].statuses_[rank][i] ) + if( coords_[coordIndex].statuses_[rank][i] ) return false; } } @@ -1514,9 +1512,7 @@ bool AxpyInterface2::Test( const Matrix& Z ) template bool AxpyInterface2::Test( Matrix& Z ) -{ - return Test( const_cast&>( Z ) ); -} +{ return Test( const_cast&>( Z ) ); } template bool AxpyInterface2::TestAny( const Matrix& Z ) @@ -1536,9 +1532,9 @@ bool AxpyInterface2::TestAny( const Matrix& Z ) const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); // search for matrix base - for ( Int m = 0; m < numMatrices; m++ ) + for( Int m = 0; m < numMatrices; m++ ) { - if ( matrices_[m].base_ == base_address ) + if( matrices_[m].base_ == base_address ) { matrixIndex = m; break; @@ -1548,9 +1544,9 @@ bool AxpyInterface2::TestAny( const Matrix& Z ) } // search for matrix base in coords - for ( Int c = 0; c < numCoords; c++ ) + for( Int c = 0; c < numCoords; c++ ) { - if ( coords_[c].base_ == base_address ) + if( coords_[c].base_ == base_address ) { coordIndex = c; break; @@ -1560,42 +1556,42 @@ bool AxpyInterface2::TestAny( const Matrix& Z ) } // matrix not found - if ( matrixIndex == numMatrices && - coordIndex == numCoords ) + if( matrixIndex == numMatrices && + coordIndex == numCoords ) return true; - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { 
matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); - if ( matrices_[matrixIndex].statuses_[rank][i] ) + if( matrices_[matrixIndex].statuses_[rank][i] ) continue; else return true; } } - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + if( coords_[coordIndex].statuses_[rank].size() == 0 ) continue; - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); - if ( coords_[coordIndex].statuses_[rank][i] ) + if( coords_[coordIndex].statuses_[rank][i] ) continue; else return true; @@ -1607,9 +1603,7 @@ bool AxpyInterface2::TestAny( const Matrix& Z ) template bool AxpyInterface2::TestAny( Matrix& Z ) -{ - return TestAny( const_cast&>( Z ) ); -} +{ return TestAny( const_cast&>( Z ) ); } template bool AxpyInterface2::Testall() @@ -1627,42 +1621,42 @@ bool AxpyInterface2::Testall() const Int numCoords = coords_.size(); // data - for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) { - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) continue; - const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size (); + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); - for ( int i = 0; i < numDataStatuses; i++ ) + for( int i = 0; i < numDataStatuses; i++ ) { matrices_[matrixIndex].statuses_[rank][i] = - !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] ); + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); - if ( matrices_[matrixIndex].statuses_[rank][i] ) + if( matrices_[matrixIndex].statuses_[rank][i] ) return false; } } } // coordinates - for ( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) + for( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) { - for ( int rank = 0; rank < p; ++rank ) + for( int rank = 0; rank < p; ++rank ) { - if ( coords_[coordIndex].statuses_[rank].size() == 0 ) + if( coords_[coordIndex].statuses_[rank].size() == 0 ) continue; - const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size (); + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); - for ( int i = 0; i < numCoordStatuses; i++ ) + for( int i = 0; i < numCoordStatuses; i++ ) { coords_[coordIndex].statuses_[rank][i] = - !mpi::Test ( coords_[coordIndex].requests_[rank][i] ); + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); - if ( coords_[coordIndex].statuses_[rank][i] ) + if( coords_[coordIndex].statuses_[rank][i] ) return false; } } @@ -1688,30 +1682,29 @@ void AxpyInterface2::Flush( const Matrix& Z ) bool DONE = false; mpi::Status status; - while ( !DONE ) + while( !DONE ) { - if ( mpi::IProbe ( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status ) ) + if( mpi::IProbe( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status ) ) { - switch ( status.MPI_TAG ) - { - case DATA_PUT_TAG: - { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - break; - } - - 
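    // (Why the IProbe branch of Flush matters: the sends posted by Put/Acc
    //  are nonblocking and only complete once the target posts matching
    //  receives, so a Flush that merely called WaitAny/Test without also
    //  servicing incoming DATA_PUT_TAG / DATA_ACC_TAG traffic could stall,
    //  every rank waiting on sends that no one is receiving. This const
    //  overload services only PUT/ACC; the non-const overload below also
    //  answers REQUEST_GET_TAG via HandleGlobalToLocalData.)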
case DATA_ACC_TAG: + switch( status.MPI_TAG ) { - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - break; - } - } + case DATA_PUT_TAG: + { + HandleLocalToGlobalData( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc( Z, status.MPI_SOURCE ); + break; + } + } } // wait for requests to // complete one by one - WaitAny ( Z ); - DONE = Test ( Z ); + WaitAny( Z ); + DONE = Test( Z ); } } @@ -1729,86 +1722,86 @@ void AxpyInterface2::Flush( Matrix& Z ) bool DONE = false; mpi::Status status; - while ( !DONE ) + while( !DONE ) { - if ( mpi::IProbe ( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm (), status ) ) + if( mpi::IProbe( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status ) ) { - switch ( status.MPI_TAG ) + switch( status.MPI_TAG ) { - case DATA_PUT_TAG: - { - HandleLocalToGlobalData ( Z, status.MPI_SOURCE ); - break; - } - - case DATA_ACC_TAG: - { - HandleLocalToGlobalAcc ( Z, status.MPI_SOURCE ); - break; - } - - case REQUEST_GET_TAG: - { - HandleGlobalToLocalData ( Z ); - break; - } + case DATA_PUT_TAG: + { + HandleLocalToGlobalData( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc( Z, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData( Z ); + break; + } } } // wait for requests to // complete one by one - WaitAny ( Z ); - DONE = Test ( Z ); + WaitAny( Z ); + DONE = Test( Z ); } } template -void AxpyInterface2::HandleLocalToGlobalData ( const Matrix& Z, Int source ) +void AxpyInterface2::HandleLocalToGlobalData( const Matrix& Z, Int source ) { DistMatrix& Y = *GlobalArrayPut_; - const Grid& g = Y.Grid (); - const Int r = g.Height (); - const Int c = g.Width (); - const Int myRow = g.Row (); - const Int myCol = g.Col (); + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); int height = Z.Height(); int width = Z.Width(); + // post receive for coordinates Int coord[3]; - mpi::TaggedRecv ( coord, 3, source, - COORD_PUT_TAG, g.VCComm() ); + mpi::TaggedRecv( coord, 3, source, + COORD_PUT_TAG, g.VCComm() ); Int i = coord[0]; Int j = coord[1]; Int count = coord[2]; // data vector std::vector getVector_; - getVector_.resize ( count ); - - DEBUG_ONLY ( if ( count < Int ( sizeof ( T ) ) ) - LogicError ( "Count was too small" ); ) - DEBUG_ONLY ( if ( Int ( getVector_.size () ) != count ) - LogicError ( "Not enough space allocated" ); ) - // post receive for data - T* getBuffer = getVector_.data(); - - mpi::TaggedRecv ( getBuffer, count, source, - DATA_PUT_TAG, g.VCComm() ); + getVector_.resize( count ); + + DEBUG_ONLY( if( count < Int( sizeof( T ) ) ) + LogicError( "Count was too small" ); ) + DEBUG_ONLY( if( Int( getVector_.size() ) != count ) + LogicError( "Not enough space allocated" ); ) + + // post receive for data + T* getBuffer = getVector_.data(); + mpi::TaggedRecv( getBuffer, count, source, + DATA_PUT_TAG, g.VCComm() ); + // Update Y const T* XBuffer = const_cast ( getBuffer ); const Int colAlign = ( Y.ColAlign() + i ) % r; const Int rowAlign = ( Y.RowAlign() + j ) % c; - const Int colShift = Shift ( myRow, colAlign, r ); - const Int rowShift = Shift ( myCol, rowAlign, c ); - const Int localHeight = Length ( height, colShift, r ); - const Int localWidth = Length ( width, rowShift, c ); - const Int iLocalOffset = Length ( i, Y.ColShift(), r ); - const Int jLocalOffset = Length ( j, Y.RowShift(), c ); - - for ( Int t = 0; t < localWidth; ++t ) + const Int colShift = Shift( myRow, colAlign, r ); + const 
 template<typename T>
-void AxpyInterface2<T>::HandleLocalToGlobalData ( const Matrix<T>& Z, Int source )
+void AxpyInterface2<T>::HandleLocalToGlobalData( const Matrix<T>& Z, Int source )
 {
     DistMatrix<T>& Y = *GlobalArrayPut_;
-    const Grid& g = Y.Grid ();
-    const Int r = g.Height ();
-    const Int c = g.Width ();
-    const Int myRow = g.Row ();
-    const Int myCol = g.Col ();
+    const Grid& g = Y.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int myRow = g.Row();
+    const Int myCol = g.Col();
     int height = Z.Height();
     int width = Z.Width();
+
     // post receive for coordinates
     Int coord[3];
-    mpi::TaggedRecv ( coord, 3, source,
-                      COORD_PUT_TAG, g.VCComm() );
+    mpi::TaggedRecv( coord, 3, source,
+                     COORD_PUT_TAG, g.VCComm() );
     Int i = coord[0];
     Int j = coord[1];
     Int count = coord[2];
+
     // data vector
     std::vector<T> getVector_;
-    getVector_.resize ( count );
-
-    DEBUG_ONLY ( if ( count < Int ( sizeof ( T ) ) )
-                 LogicError ( "Count was too small" ); )
-    DEBUG_ONLY ( if ( Int ( getVector_.size () ) != count )
-                 LogicError ( "Not enough space allocated" ); )
-    // post receive for data
-    T* getBuffer = getVector_.data();
-
-    mpi::TaggedRecv ( getBuffer, count, source,
-                      DATA_PUT_TAG, g.VCComm() );
+    getVector_.resize( count );
+
+    DEBUG_ONLY( if( count < Int( sizeof( T ) ) )
+                LogicError( "Count was too small" ); )
+    DEBUG_ONLY( if( Int( getVector_.size() ) != count )
+                LogicError( "Not enough space allocated" ); )
+
+    // post receive for data
+    T* getBuffer = getVector_.data();
+    mpi::TaggedRecv( getBuffer, count, source,
+                     DATA_PUT_TAG, g.VCComm() );
+
     // Update Y
     const T* XBuffer = const_cast<const T*>( getBuffer );
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
-    const Int colShift = Shift ( myRow, colAlign, r );
-    const Int rowShift = Shift ( myCol, rowAlign, c );
-    const Int localHeight = Length ( height, colShift, r );
-    const Int localWidth = Length ( width, rowShift, c );
-    const Int iLocalOffset = Length ( i, Y.ColShift(), r );
-    const Int jLocalOffset = Length ( j, Y.RowShift(), c );
-
-    for ( Int t = 0; t < localWidth; ++t )
+    const Int colShift = Shift( myRow, colAlign, r );
+    const Int rowShift = Shift( myCol, rowAlign, c );
+    const Int localHeight = Length( height, colShift, r );
+    const Int localWidth = Length( width, rowShift, c );
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
+
+    for( Int t = 0; t < localWidth; ++t )
     {
-        T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t );
+        T* YCol = Y.Buffer( iLocalOffset, jLocalOffset + t );
         const T* XCol = &XBuffer[t * localHeight];
-        MemCopy ( YCol, XCol, localHeight );
+        MemCopy( YCol, XCol, localHeight );
     }

     // Free the memory
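The handler above is the receiving half of a two-message protocol: a small coordinate triple (i, j, count) arrives first and sizes the receive for the payload that follows. The sending half, sketched in raw MPI with placeholder tags standing in for COORD_PUT_TAG/DATA_PUT_TAG:

    #include <mpi.h>
    #include <vector>

    void SendPutSketch( MPI_Comm comm, int target, int i, int j,
                        const std::vector<double>& payload )
    {
        const int COORD_TAG = 5, DATA_TAG = 6; // placeholders
        int coord[3] = { i, j, (int)payload.size() };
        // header first, so the receiver can size its buffer...
        MPI_Send( coord, 3, MPI_INT, target, COORD_TAG, comm );
        // ...then the data itself
        MPI_Send( payload.data(), (int)payload.size(), MPI_DOUBLE,
                  target, DATA_TAG, comm );
    }
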
@@ -1817,59 +1810,58 @@ void AxpyInterface2<T>::HandleLocalToGlobalData ( const Matrix<T>& Z, Int source )

 template<typename T>
 void AxpyInterface2<T>::HandleLocalToGlobalData( Matrix<T>& Z, Int source )
-{
-    HandleLocalToGlobalData( const_cast<const Matrix<T>&>( Z ), source );
-}
+{ HandleLocalToGlobalData( const_cast<const Matrix<T>&>( Z ), source ); }

 // replica of above function except this accumulates
 template<typename T>
-void AxpyInterface2<T>::HandleLocalToGlobalAcc ( const Matrix<T>& Z, Int source )
+void AxpyInterface2<T>::HandleLocalToGlobalAcc( const Matrix<T>& Z, Int source )
 {
     DistMatrix<T>& Y = *GlobalArrayPut_;
-    const Grid& g = Y.Grid ();
-    const Int r = g.Height ();
-    const Int c = g.Width ();
-    const Int myRow = g.Row ();
-    const Int myCol = g.Col ();
+    const Grid& g = Y.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int myRow = g.Row();
+    const Int myCol = g.Col();
     const int height = Z.Height();
     const int width = Z.Width();
+
     // post receive for coordinates
     Int coord[3];
-    mpi::TaggedRecv ( coord, 3, source,
-                      COORD_ACC_TAG, g.VCComm() );
+    mpi::TaggedRecv( coord, 3, source,
+                     COORD_ACC_TAG, g.VCComm() );
     Int i = coord[0];
     Int j = coord[1];
     Int count = coord[2];
+
     // data buffer
     std::vector<T> getVector_;
-    getVector_.resize ( count );
-
-    DEBUG_ONLY ( if ( count < Int ( sizeof ( T ) ) )
-                 LogicError ( "Count was too small" ); )
-    DEBUG_ONLY ( if ( Int ( getVector_.size () ) != count )
-                 LogicError ( "Not enough space allocated" ); )
-    // post receive for data
-    T* getBuffer = getVector_.data();
-
-    mpi::TaggedRecv ( getBuffer, count, source,
-                      DATA_ACC_TAG, g.VCComm() );
+    getVector_.resize( count );
+
+    DEBUG_ONLY( if( count < Int( sizeof( T ) ) )
+                LogicError( "Count was too small" ); )
+    DEBUG_ONLY( if( Int( getVector_.size() ) != count )
+                LogicError( "Not enough space allocated" ); )
+
+    // post receive for data
+    T* getBuffer = getVector_.data();
+    mpi::TaggedRecv( getBuffer, count, source,
+                     DATA_ACC_TAG, g.VCComm() );

     // Update Y
     const T* XBuffer = const_cast<const T*>( getBuffer );
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
-    const Int colShift = Shift ( myRow, colAlign, r );
-    const Int rowShift = Shift ( myCol, rowAlign, c );
-    const Int localHeight = Length ( height, colShift, r );
-    const Int localWidth = Length ( width, rowShift, c );
-    const Int iLocalOffset = Length ( i, Y.ColShift(), r );
-    const Int jLocalOffset = Length ( j, Y.RowShift(), c );
-
-    for ( Int t = 0; t < localWidth; ++t )
+    const Int colShift = Shift( myRow, colAlign, r );
+    const Int rowShift = Shift( myCol, rowAlign, c );
+    const Int localHeight = Length( height, colShift, r );
+    const Int localWidth = Length( width, rowShift, c );
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
+
+    for( Int t = 0; t < localWidth; ++t )
     {
-        T* YCol = Y.Buffer ( iLocalOffset, jLocalOffset + t );
+        T* YCol = Y.Buffer( iLocalOffset, jLocalOffset + t );
         const T* XCol = &XBuffer[t * localHeight];
-        for ( Int s = 0; s < localHeight; ++s )
+        for( Int s = 0; s < localHeight; ++s )
             YCol[s] += XCol[s];
     }
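The accumulate handler differs from the put handler only in its inner kernel: MemCopy overwrites, while this loop adds. One hypothetical way to factor out that difference (illustrative only, not code from the patch):

    // Apply 'op' column-wise; put passes an overwrite, acc passes a sum.
    template<typename T, typename BinaryOp>
    void ApplyColumn( T* dst, const T* src, int n, BinaryOp op )
    {
        for( int s = 0; s < n; ++s )
            dst[s] = op( dst[s], src[s] );
    }
    // put: ApplyColumn( YCol, XCol, localHeight,
    //                   []( double, double x ) { return x; } );
    // acc: ApplyColumn( YCol, XCol, localHeight,
    //                   []( double y, double x ) { return y + x; } );
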
@@ -1879,17 +1871,15 @@ void AxpyInterface2<T>::HandleLocalToGlobalAcc ( const Matrix<T>& Z, Int source )

 template<typename T>
 void AxpyInterface2<T>::HandleLocalToGlobalAcc( Matrix<T>& Z, Int source )
-{
-    HandleLocalToGlobalAcc( const_cast<const Matrix<T>&>( Z ), source );
-}
+{ HandleLocalToGlobalAcc( const_cast<const Matrix<T>&>( Z ), source ); }

 // handle request for data, post a matching isend
 template<typename T>
-void AxpyInterface2<T>::HandleGlobalToLocalData ( Matrix<T>& Z )
+void AxpyInterface2<T>::HandleGlobalToLocalData( Matrix<T>& Z )
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "AxpyInterface::HandleGlobalToLocalData" ) )
+    DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleGlobalToLocalData" ) )

-    if ( !toBeAttachedForGet_ )
+    if( !toBeAttachedForGet_ )
         LogicError( "Local matrix cannot be updated" );

     const DistMatrix<T>& Y = *GlobalArrayGet_;
@@ -1908,86 +1898,86 @@ void AxpyInterface2<T>::HandleGlobalToLocalData( Matrix<T>& Z )
     const Int height = Z.Height();
     const Int width = Z.Width();

-    for ( Int step = 0; step < p; step++ )
+    for( Int step = 0; step < p; step++ )
     {
         mpi::Status status;

-        if ( mpi::IProbe ( mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm (), status ) )
+        if( mpi::IProbe( mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status ) )
         {
             const Int source = status.MPI_SOURCE;
             // post receive for coordinates
             Int coord[3];
-            mpi::TaggedRecv ( coord, 3, source,
-                              REQUEST_GET_TAG, g.VCComm() );
+            mpi::TaggedRecv( coord, 3, source,
+                             REQUEST_GET_TAG, g.VCComm() );
             i = coord[0];
             j = coord[1];

             // we need the localwidth/height here,
             // used also to calculate numEntries
             const Int colAlign = ( Y.ColAlign() + i ) % r;
             const Int rowAlign = ( Y.RowAlign() + j ) % c;
-            const Int colShift = Shift ( myRow, colAlign, r );
-            const Int rowShift = Shift ( myCol, rowAlign, c );
-            const Int localHeight = Length ( height, colShift, r );
-            const Int localWidth = Length ( width, rowShift, c );
-            const Int iLocalOffset = Length ( i, Y.ColShift (), r );
-            const Int jLocalOffset = Length ( j, Y.RowShift (), c );
+            const Int colShift = Shift( myRow, colAlign, r );
+            const Int rowShift = Shift( myCol, rowAlign, c );
+            const Int localHeight = Length( height, colShift, r );
+            const Int localWidth = Length( width, rowShift, c );
+            const Int iLocalOffset = Length( i, Y.ColShift(), r );
+            const Int jLocalOffset = Length( j, Y.RowShift(), c );

             const Int numEntries = localHeight * localWidth;

-            DEBUG_ONLY ( if ( numEntries < Int ( sizeof ( T ) ) )
-                         LogicError ( "Count was too small" ); )
+            DEBUG_ONLY( if( numEntries < Int( sizeof( T ) ) )
+                        LogicError( "Count was too small" ); )

             const Int index =
-                NextIndexData ( source,
-                                numEntries,
-                                Buffer,
-                                &matrix_index );
+                NextIndexData( source,
+                               numEntries,
+                               Buffer,
+                               &matrix_index );

-            DEBUG_ONLY ( if
-                         ( Int ( matrices_[matrix_index].data_[source][index].size () ) !=
-                           numEntries ) LogicError ( "Error in NextIndexData" ); )
-            T* replyBuffer = matrices_[matrix_index].data_[source][index].data ();
+            DEBUG_ONLY( if
+                        ( Int( matrices_[matrix_index].data_[source][index].size() ) !=
+                          numEntries ) LogicError( "Error in NextIndexData" ); )
+            T* replyBuffer = matrices_[matrix_index].data_[source][index].data();

-            for ( Int t = 0; t < localWidth; ++t )
+            for( Int t = 0; t < localWidth; ++t )
             {
                 T* sendCol = &replyBuffer[t * localHeight];
-                const T* XCol = Y.LockedBuffer ( iLocalOffset, jLocalOffset + t );
-                MemCopy ( sendCol, XCol, localHeight );
+                const T* XCol = Y.LockedBuffer( iLocalOffset, jLocalOffset + t );
+                MemCopy( sendCol, XCol, localHeight );
             }

             // Fire off non-blocking send
-            mpi::TaggedISend ( replyBuffer, numEntries, source,
-                               DATA_GET_TAG, g.VCComm (),
-                               matrices_[matrix_index].requests_[source][index] );
+            mpi::TaggedISend( replyBuffer, numEntries, source,
+                              DATA_GET_TAG, g.VCComm(),
+                              matrices_[matrix_index].requests_[source][index] );
         }

         // receive data
-        if ( mpi::IProbe
-             ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm (), status ) )
+        if( mpi::IProbe
+            ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm(), status ) )
         {
             const Int source = status.MPI_SOURCE;
             // Ensure that we have a recv buffer
             const Int count = mpi::GetCount ( status );
-            recvVector_.resize ( count );
-            T* recvBuffer = recvVector_.data ();
+            recvVector_.resize( count );
+            T* recvBuffer = recvVector_.data();

             // Receive the data
             mpi::TaggedRecv
-            ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm () );
+            ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm() );

             // Compute the local heights and offsets
-            const Int myRow = g.Row ();
-            const Int myCol = g.Col ();
-            const Int colAlign = ( Y.ColAlign () + i ) % r;
-            const Int rowAlign = ( Y.RowAlign () + j ) % c;
-            const Int colShift = Shift ( myRow, colAlign, r );
-            const Int rowShift = Shift ( myCol, rowAlign, c );
-            const Int localHeight = Length ( height, colShift, r );
-            const Int localWidth = Length ( width, rowShift, c );
+            const Int myRow = g.Row();
+            const Int myCol = g.Col();
+            const Int colAlign = ( Y.ColAlign() + i ) % r;
+            const Int rowAlign = ( Y.RowAlign() + j ) % c;
+            const Int colShift = Shift( myRow, colAlign, r );
+            const Int rowShift = Shift( myCol, rowAlign, c );
+            const Int localHeight = Length( height, colShift, r );
+            const Int localWidth = Length( width, rowShift, c );

             // Unpack the local matrix
-            for ( Int t = 0; t < localWidth; ++t )
+            for( Int t = 0; t < localWidth; ++t )
             {
-                T* YCol = Z.Buffer ( 0, rowShift + t * c );
+                T* YCol = Z.Buffer( 0, rowShift + t * c );
                 const T* XCol = &recvBuffer[t * localHeight];

-                for ( Int s = 0; s < localHeight; ++s )
+                for( Int s = 0; s < localHeight; ++s )
                     YCol[colShift + s * r] = XCol[s];
             }
         }
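HandleGlobalToLocalData implements a request-reply exchange: the requester sends a coordinate request, and the owner packs the locally held entries and answers with a nonblocking send on a reply tag. The requester's side, sketched with blocking calls and placeholder tags (REQ_TAG/REPLY_TAG stand in for REQUEST_GET_TAG/DATA_GET_TAG):

    #include <mpi.h>
    #include <vector>

    void RequestAndReceive( MPI_Comm comm, int owner, int i, int j, int n,
                            std::vector<double>& out )
    {
        const int REQ_TAG = 7, REPLY_TAG = 8; // placeholders
        int coord[3] = { i, j, n };
        MPI_Send( coord, 3, MPI_INT, owner, REQ_TAG, comm );
        out.resize( n );
        MPI_Recv( out.data(), n, MPI_DOUBLE, owner, REPLY_TAG, comm,
                  MPI_STATUS_IGNORE );
    }
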
@@ -2003,7 +1993,7 @@ void AxpyInterface2<T>::Detach()
     DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Detach" ) )

     // destructor will call detach again...
-    if ( detached_ )
+    if( detached_ )
         return;

     if( !attached_ )
@@ -2013,15 +2003,19 @@ void AxpyInterface2<T>::Detach()
                       GlobalArrayPut_->Grid() :
                       GlobalArrayGet_->Grid() );
     const Int p = g.Size();
+
     mpi::Barrier( g.VCComm() );
+
     attached_ = false;
     detached_ = true;
+
     toBeAttachedForPut_ = false;
     toBeAttachedForGet_ = false;
+
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;

-    if ( !dataVectors_.empty() )
+    if( !dataVectors_.empty() )
         dataVectors_.clear();

     matrices_.clear();
diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp
index d7129ab04b..a3e6420747 100644
--- a/src/core/RmaInterface.cpp
+++ b/src/core/RmaInterface.cpp
@@ -21,7 +21,7 @@ namespace El
 template<typename T>
 RmaInterface<T>::RmaInterface()
     : GlobalArrayPut_( 0 ), GlobalArrayGet_( 0 ),
-      matrices_( 0 ), window ( MPI_WIN_NULL ),
+      matrices_( 0 ), window( MPI_WIN_NULL ),
       putVector_( 0 ), getVector_( 0 ),
       toBeAttachedForPut_( false ), toBeAttachedForGet_( false ),
       attached_( false ), detached_( true )
@@ -31,12 +31,16 @@ template<typename T>
 RmaInterface<T>::RmaInterface( DistMatrix<T>& Z )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) )
+
     attached_ = false;
     detached_ = true;
+
     toBeAttachedForGet_ = false;
     toBeAttachedForPut_ = false;
+
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;
+
     window = MPI_WIN_NULL;
 }

@@ -47,32 +51,36 @@ template<typename T>
 RmaInterface<T>::RmaInterface( const DistMatrix<T>& X )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) )
+
     attached_ = false;
     detached_ = true;
+
     toBeAttachedForGet_ = false;
     toBeAttachedForPut_ = false;
+
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;
+
     window = MPI_WIN_NULL;
 }

 template<typename T>
 RmaInterface<T>::~RmaInterface()
 {
-    if( std::uncaught_exception() )
-    {
-        std::ostringstream os;
-        os << "Uncaught exception detected during RmaInterface destructor "
-           "that required a call to Detach. Instead of allowing for the "
-           "possibility of Detach throwing another exception and "
-           "resulting in a 'terminate', we instead immediately dump the "
-           "call stack (if not in RELEASE mode) since the program will "
-           "likely hang:" << std::endl;
-        std::cerr << os.str();
-        DEBUG_ONLY( DumpCallStack() )
-    }
-    else
-        Detach();
+    if( std::uncaught_exception() )
+    {
+        std::ostringstream os;
+        os << "Uncaught exception detected during RmaInterface destructor "
+           "that required a call to Detach. Instead of allowing for the "
+           "possibility of Detach throwing another exception and "
+           "resulting in a 'terminate', we instead immediately dump the "
+           "call stack (if not in RELEASE mode) since the program will "
+           "likely hang:" << std::endl;
+        std::cerr << os.str();
+        DEBUG_ONLY( DumpCallStack() )
+    }
+    else
+        Detach();
 }

@@ -82,7 +90,7 @@ void RmaInterface<T>::Attach( DistMatrix<T>& Z )

     // attached_ will be only set in Attach
     // and only unset in Detach
-    if ( !attached_ && detached_ )
+    if( !attached_ && detached_ )
     {
         attached_ = true;
         detached_ = false;
@@ -98,10 +106,11 @@ void RmaInterface<T>::Attach( DistMatrix<T>& Z )
         toBeAttachedForPut_ = true;
         GlobalArrayGet_ = &Z;
         toBeAttachedForGet_ = true;
-        const Grid& g = Z.Grid();
-        const Int p = g.Size ();
+
+        const Grid& g = Z.Grid();
+        const Int p = g.Size();

-        if ( matrices_.empty() )
+        if( matrices_.empty() )
         {
             struct matrix_params_ mp;
             mp.data_.resize( p );
@@ -113,7 +122,7 @@ void RmaInterface<T>::Attach( DistMatrix<T>& Z )
             matrices_.push_back( mp );
         }

-        if ( putVector_.empty() )
+        if( putVector_.empty() )
         {
             getVector_.resize( p );
             putVector_.resize( p );
@@ -121,12 +130,12 @@ void RmaInterface<T>::Attach( DistMatrix<T>& Z )

         // TODO rma related checks
         // creation of window
-        const Int numEntries = Z.LocalHeight () * Z.LocalWidth ();
+        const Int numEntries = Z.LocalHeight() * Z.LocalWidth();
         const Int bufferSize = numEntries * sizeof( T );
-        void* baseptr = reinterpret_cast<void*>( Z.Buffer () );
+        void* baseptr = reinterpret_cast<void*>( Z.Buffer() );
         assert( baseptr != NULL );
-        mpi::WindowCreate ( baseptr, bufferSize, g.VCComm (), window );
-        mpi::WindowLock ( window );
+        mpi::WindowCreate( baseptr, bufferSize, g.VCComm(), window );
+        mpi::WindowLock( window );
     }
 }

@@ -136,7 +145,7 @@ void RmaInterface<T>::Attach( const DistMatrix<T>& X )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Attach" ) )

-    if ( !attached_ && detached_ )
+    if( !attached_ && detached_ )
     {
         attached_ = true;
         detached_ = false;
@@ -150,19 +159,20 @@ void RmaInterface<T>::Attach( const DistMatrix<T>& X )
         toBeAttachedForGet_ = true;
         GlobalArrayPut_ = 0;
         toBeAttachedForPut_ = false;
-        const Grid& g = X.Grid();
-        const Int p = g.Size ();
+
+        const Grid& g = X.Grid();
+        const Int p = g.Size();

-        if ( getVector_.size() != p )
+        if( getVector_.size() != p )
             getVector_.resize( p );

         //TODO rma related checks
-        const Int numEntries = X.LocalHeight () * X.LocalWidth ();
+        const Int numEntries = X.LocalHeight() * X.LocalWidth();
         const Int bufferSize = numEntries * sizeof( T );
-        void* baseptr = static_cast<void*>( const_cast<T*>( X.LockedBuffer () ) );
-        assert ( baseptr != NULL );
-        mpi::WindowCreate ( baseptr, bufferSize, g.VCComm (), window );
-        mpi::WindowLock ( window );
+        void* baseptr = static_cast<void*>( const_cast<T*>( X.LockedBuffer() ) );
+        assert( baseptr != NULL );
+        mpi::WindowCreate( baseptr, bufferSize, g.VCComm(), window );
+        mpi::WindowLock( window );
     }
 }
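In raw MPI-3 terms, Attach amounts to exposing the local buffer in a window and opening a passive-target access epoch, so later Put/Acc/Get calls never require matching calls on the target. A minimal sketch of what Elemental's mpi::WindowCreate/WindowLock wrappers are assumed to reduce to:

    #include <mpi.h>

    MPI_Win ExposeBuffer( void* localBuf, MPI_Aint bytes, MPI_Comm comm )
    {
        MPI_Win win;
        MPI_Win_create( localBuf, bytes, /*disp_unit=*/1,
                        MPI_INFO_NULL, comm, &win );
        // Shared lock on all ranks: the passive-target epoch stays open
        // until the matching MPI_Win_unlock_all at detach time.
        MPI_Win_lock_all( MPI_MODE_NOCHECK, win );
        return win;
    }
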
@@ -172,23 +182,23 @@ Int RmaInterface<T>::NextIndex (
     Int dataSize,
     std::deque<std::vector<T>>& dataVectors )
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "RmaInterface::NextIndex" ) )
-    const Int numCreated = dataVectors.size ();
-    dataVectors.resize ( numCreated + 1 );
-    dataVectors[numCreated].resize ( dataSize );
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::NextIndex" ) )
+    const Int numCreated = dataVectors.size();
+    dataVectors.resize( numCreated + 1 );
+    dataVectors[numCreated].resize( dataSize );
     return numCreated;
 }

 // for request-based passive rma
 template<typename T>
-Int RmaInterface<T>::NextIndex (
+Int RmaInterface<T>::NextIndex(
     Int target,
     Int dataSize,
     const void* base_address,
     Int* mindex )
 {
-    DEBUG_ONLY ( CallStackEntry cse ( "RmaInterface::NextIndex" ) )
-    assert ( base_address != NULL );
+    DEBUG_ONLY( CallStackEntry cse( "RmaInterface::NextIndex" ) )
+    assert( base_address != NULL );

     Int matrixIndex = 0;
     const Grid& g = ( toBeAttachedForPut_ ?
                       GlobalArrayPut_->Grid() :
@@ -197,16 +207,16 @@ Int RmaInterface<T>::NextIndex(
     const Int numMatrices = matrices_.size();

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
         }

         // uninitiated, first time
-        if ( matrices_[m].base_ == NULL )
+        if( matrices_[m].base_ == NULL )
         {
             matrices_[m].base_ = base_address;
             matrixIndex = m;
@@ -217,7 +227,7 @@ Int RmaInterface<T>::NextIndex(
     }

     // need to create new object
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
     {
         struct matrix_params_ mp;
         mp.data_.resize( p );
@@ -232,35 +242,35 @@ Int RmaInterface<T>::NextIndex(

     // go through the request, data,
     // status objects
-    const Int numCreated = matrices_[matrixIndex].data_[target].size ();
-
-    DEBUG_ONLY ( if ( numCreated != Int ( matrices_[matrixIndex].requests_[target].size () ) ||
-                      numCreated != Int ( matrices_[matrixIndex].statuses_[target].size () ) )
-                 LogicError ( "size mismatch" ); )
+    const Int numCreated = matrices_[matrixIndex].data_[target].size();

-    for ( Int i = 0; i < numCreated; ++i )
+    DEBUG_ONLY( if( numCreated != Int( matrices_[matrixIndex].requests_[target].size() ) ||
+                    numCreated != Int( matrices_[matrixIndex].statuses_[target].size() ) )
+                LogicError( "size mismatch" ); )
+
+    for( Int i = 0; i < numCreated; ++i )
     {
-        // If this request is still running,
-        // test to see if it finished.
-        if ( matrices_[matrixIndex].statuses_[target][i] )
-        {
-            const bool finished = mpi::Test ( matrices_[matrixIndex].requests_[target][i] );
+        // If this request is still running,
+        // test to see if it finished.
+        if( matrices_[matrixIndex].statuses_[target][i] )
+        {
+            const bool finished = mpi::Test( matrices_[matrixIndex].requests_[target][i] );
             matrices_[matrixIndex].statuses_[target][i] = !finished;
         }

-        if ( !matrices_[matrixIndex].statuses_[target][i] )
+        if( !matrices_[matrixIndex].statuses_[target][i] )
         {
-            matrices_[matrixIndex].statuses_[target][i] = true;
-            matrices_[matrixIndex].data_[target][i].resize ( dataSize );
+            matrices_[matrixIndex].statuses_[target][i] = true;
+            matrices_[matrixIndex].data_[target][i].resize( dataSize );
             *mindex = matrixIndex;
             return i;
         }
     }

-    matrices_[matrixIndex].data_[target].resize ( numCreated + 1 );
-    matrices_[matrixIndex].data_[target][numCreated].resize ( dataSize );
-    matrices_[matrixIndex].requests_[target].push_back ( mpi::REQUEST_NULL );
-    matrices_[matrixIndex].statuses_[target].push_back ( true );
+    matrices_[matrixIndex].data_[target].resize( numCreated + 1 );
+    matrices_[matrixIndex].data_[target][numCreated].resize( dataSize );
+    matrices_[matrixIndex].requests_[target].push_back( mpi::REQUEST_NULL );
+    matrices_[matrixIndex].statuses_[target].push_back( true );
     *mindex = matrixIndex;
     return numCreated;
 }
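NextIndex is a send-buffer pool: scan the existing slots for one whose request has completed, recycle it, and only grow the pool when every slot is still busy. The same idea, stripped of the per-matrix and per-target bookkeeping (all names illustrative):

    #include <mpi.h>
    #include <vector>

    struct Slot
    {
        std::vector<char> data;
        MPI_Request req = MPI_REQUEST_NULL;
        bool busy = false;
    };

    int AcquireSlot( std::vector<Slot>& pool, int bytes )
    {
        for( std::size_t i = 0; i < pool.size(); ++i )
        {
            if( pool[i].busy )
            {
                int done = 0;
                MPI_Test( &pool[i].req, &done, MPI_STATUS_IGNORE );
                pool[i].busy = ( done == 0 );
            }
            if( !pool[i].busy )
            {
                pool[i].busy = true;        // recycle a finished slot
                pool[i].data.resize( bytes );
                return (int)i;
            }
        }
        pool.push_back( Slot{} );           // every slot in flight: grow
        pool.back().busy = true;
        pool.back().data.resize( bytes );
        return (int)pool.size() - 1;
    }
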
@@ -274,7 +284,7 @@ void RmaInterface<T>::Rput( const Matrix<T>& Z, Int i, Int j )
     if( i < 0 || j < 0 )
         LogicError( "Submatrix offsets must be non-negative" );

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated" );

     DistMatrix<T>& Y = *GlobalArrayPut_;
@@ -292,14 +302,15 @@ void RmaInterface<T>::Rput( const Matrix<T>& Z, Int i, Int j )
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
     const Int XLDim = Z.LDim();
+
     // local matrix width and height
     const Int height = Z.Height();
     const Int width = Z.Width();
     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
-    const Int YLDim = Y.LDim ();
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
+    const Int YLDim = Y.LDim();
     const T* XBuffer = Z.LockedBuffer();
     const void* Buffer = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
     Int matrix_index;
@@ -317,16 +328,15 @@ void RmaInterface<T>::Rput( const Matrix<T>& Z, Int i, Int j )
         {
             const Int destination = receivingRow + r*receivingCol;
             const Int index =
-                NextIndex ( destination,
-                            numEntries,
-                            Buffer,
-                            &matrix_index );
-
-            DEBUG_ONLY ( if
-                         ( Int ( matrices_[matrix_index].data_[destination][index].size () ) !=
-                           numEntries ) LogicError ( "Error in NextIndex" ); )
+                NextIndex( destination,
+                           numEntries,
+                           Buffer,
+                           &matrix_index );

-            T* sendBuffer = reinterpret_cast<T*>( matrices_[matrix_index].data_[destination][index].data () );
+            DEBUG_ONLY( if
+                        ( Int( matrices_[matrix_index].data_[destination][index].size() ) !=
+                          numEntries ) LogicError( "Error in NextIndex" ); )
+            T* sendBuffer = reinterpret_cast<T*>( matrices_[matrix_index].data_[destination][index].data() );

             for( Int t=0; t<localWidth; ++t )
             {
@@ -340,10 +350,10 @@ void RmaInterface<T>::Rput( const Matrix<T>& Z, Int i, Int j )
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // put
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Rput ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window,
-                            matrices_[matrix_index].requests_[destination][index] );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Rput( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window,
+                           matrices_[matrix_index].requests_[destination][index] );
             }
         }

@@ -362,7 +372,7 @@ void RmaInterface<T>::Racc( const Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Racc" ) )

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated." );

     if( i < 0 || j < 0 )
@@ -383,15 +393,16 @@ void RmaInterface<T>::Racc( const Matrix<T>& Z, Int i, Int j )
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
     const Int XLDim = Z.LDim();
-    const Int YLDim = Y.LDim ();
+    const Int YLDim = Y.LDim();
+
     // local matrix width and height
     const Int height = Z.Height();
     const Int width = Z.Width();
     const T* XBuffer = Z.LockedBuffer();
     const void* Buffer = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );
     Int matrix_index;
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
@@ -408,16 +419,15 @@ void RmaInterface<T>::Racc( const Matrix<T>& Z, Int i, Int j )
         {
             const Int destination = receivingRow + r*receivingCol;
             const Int index =
-                NextIndex ( destination,
-                            numEntries,
-                            Buffer,
-                            &matrix_index );
+                NextIndex( destination,
+                           numEntries,
+                           Buffer,
+                           &matrix_index );

-            DEBUG_ONLY ( if
-                         ( Int ( matrices_[matrix_index].data_[destination][index].size () ) !=
-                           numEntries ) LogicError ( "Error in NextIndex" ); )
-
-            T* sendBuffer = reinterpret_cast<T*>( matrices_[matrix_index].data_[destination][index].data () );
+            DEBUG_ONLY( if
+                        ( Int( matrices_[matrix_index].data_[destination][index].size() ) !=
+                          numEntries ) LogicError( "Error in NextIndex" ); )
+            T* sendBuffer = reinterpret_cast<T*>( matrices_[matrix_index].data_[destination][index].data() );

             for( Int t=0; t<localWidth; ++t )
             {
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // acc
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Racc ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window,
-                            matrices_[matrix_index].requests_[destination][index] );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Racc( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window,
+                           matrices_[matrix_index].requests_[destination][index] );
             }
         }
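Rput and Racc use request-based RMA, so each transfer carries its own request and can be tested or completed individually instead of flushing the whole window. In raw MPI-3, with window and buffer setup assumed done elsewhere:

    #include <mpi.h>

    void RequestBasedPut( MPI_Win win, int target, MPI_Aint disp,
                          const double* buf, int n )
    {
        MPI_Request req;
        MPI_Rput( buf, n, MPI_DOUBLE, target, disp, n, MPI_DOUBLE,
                  win, &req );
        // Local completion of this one operation; the access epoch on the
        // window stays open for further transfers.
        MPI_Wait( &req, MPI_STATUS_IGNORE );
    }
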
@@ -455,7 +465,7 @@ void RmaInterface<T>::Put( const Matrix<T>& Z, Int i, Int j )
     if( i < 0 || j < 0 )
         LogicError( "Submatrix offsets must be non-negative" );

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated" );

     DistMatrix<T>& Y = *GlobalArrayPut_;
@@ -478,9 +488,9 @@ void RmaInterface<T>::Put( const Matrix<T>& Z, Int i, Int j )
     const Int width = Z.Width();
     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
-    const Int YLDim = Y.LDim ();
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
+    const Int YLDim = Y.LDim();
     const T* XBuffer = Z.LockedBuffer();

     for( Int step=0; step<p; ++step )
     {
         {
             const Int destination = receivingRow + r*receivingCol;
             const Int index =
-                NextIndex ( numEntries,
-                            putVector_[destination] );
+                NextIndex( numEntries,
+                           putVector_[destination] );

             T* sendBuffer = putVector_[destination][index].data();

             for( Int t=0; t<localWidth; ++t )
             {
                 T* thisSendCol = &sendBuffer[t*localHeight];
                 const T* thisXCol = &XBuffer[( rowShift+t*c )*XLDim];

                 for( Int s=0; s<localHeight; ++s )
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // put
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Iput ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Iput( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window );
             }

-            mpi::FlushLocal ( destination, window );
+            mpi::FlushLocal( destination, window );
         }

         receivingRow = ( receivingRow + 1 ) % r;
@@ -535,7 +545,7 @@ void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Acc" ) )

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated." );

     if( i < 0 || j < 0 )
@@ -556,12 +566,13 @@ void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
     const Int XLDim = Z.LDim();
-    const Int YLDim = Y.LDim ();
+    const Int YLDim = Y.LDim();
+
     // local matrix width and height
     const Int height = Z.Height();
     const Int width = Z.Width();
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
     const T* XBuffer = Z.LockedBuffer();
@@ -578,7 +589,7 @@ void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
         if( numEntries != 0 )
         {
             const Int destination = receivingRow + r*receivingCol;
-            const Int index = RmaInterface::NextIndex ( numEntries,
+            const Int index = RmaInterface::NextIndex( numEntries,
                               putVector_[destination] );

             T* sendBuffer = putVector_[destination][index].data();
@@ -591,12 +602,12 @@ void RmaInterface<T>::Acc( const Matrix<T>& Z, Int i, Int j )
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // acc
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Iacc ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Iacc( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window );
             }

-            mpi::FlushLocal ( destination, window );
+            mpi::FlushLocal( destination, window );
         }

         receivingRow = ( receivingRow + 1 ) % r;
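Put and Acc instead issue ordinary one-sided operations and force per-destination local completion with a flush, after which the staging buffer can be reused; remote completion is deferred. The raw MPI-3 shape of that pattern:

    #include <mpi.h>

    void PutThenFlushLocal( MPI_Win win, int target, MPI_Aint disp,
                            const double* buf, int n )
    {
        MPI_Put( buf, n, MPI_DOUBLE, target, disp, n, MPI_DOUBLE, win );
        // After this returns the origin buffer may be reused; remote
        // completion still needs MPI_Win_flush( target, win ) or a full
        // window flush.
        MPI_Win_flush_local( target, win );
    }
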
@@ -619,30 +630,30 @@ void RmaInterface<T>::Get( Matrix<T>& Z, Int i, Int j )
     // a call to Attach with a non-const DistMatrix must set
     // toBeAttachedForGet_ also, if not then it is assumed that
     // the DistMatrix isn't attached
-    if ( !toBeAttachedForGet_ )
-        LogicError ( "Cannot perform this operation as matrix is not attached." );
+    if( !toBeAttachedForGet_ )
+        LogicError( "Cannot perform this operation as matrix is not attached." );

     const DistMatrix<T>& X = *GlobalArrayGet_;

-    const Grid& g = X.Grid ();
-    const Int r = g.Height ();
-    const Int c = g.Width ();
-    const Int p = g.Size ();
-    const Int myRow = g.Row ();
-    const Int myCol = g.Col ();
+    const Grid& g = X.Grid();
+    const Int r = g.Height();
+    const Int c = g.Width();
+    const Int p = g.Size();
+    const Int myRow = g.Row();
+    const Int myCol = g.Col();
     const Int myProcessRow = g.Row();
     const Int myProcessCol = g.Col();

     // local width and height
     const Int height = Z.Height();
     const Int width = Z.Width();

-    if ( i + height > X.Height () || j + width > X.Width () )
+    if( i + height > X.Height() || j + width > X.Width() )
         LogicError( "Submatrix out of bounds of global matrix" );

     const Int colAlign = ( X.ColAlign() + i ) % r;
     const Int rowAlign = ( X.RowAlign() + j ) % c;
-    const Int iLocalOffset = Length ( i, X.ColShift (), r );
-    const Int jLocalOffset = Length ( j, X.RowShift (), c );
-    const Int XLDim = X.LDim ();
+    const Int iLocalOffset = Length( i, X.ColShift(), r );
+    const Int jLocalOffset = Length( j, X.RowShift(), c );
+    const Int XLDim = X.LDim();

     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
@@ -657,29 +668,29 @@ void RmaInterface<T>::Get( Matrix<T>& Z, Int i, Int j )
         if( numEntries != 0 )
         {
             const Int destination = receivingRow + r*receivingCol;
-            const Int index = RmaInterface::NextIndex ( numEntries,
+            const Int index = RmaInterface::NextIndex( numEntries,
                               getVector_[destination] );
-            T* getBuffer = getVector_[destination][index].data ();
+            T* getBuffer = getVector_[destination][index].data();

             // get
             for( Int t=0; t<localWidth; ++t )
             {
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * XLDim ) * sizeof( T );
-                mpi::Iget ( &getBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * XLDim ) * sizeof( T );
+                mpi::Iget( &getBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window );
             }

-            mpi::Flush ( destination, window );
+            mpi::Flush( destination, window );

             // update local matrix
             for( Int t=0; t<localWidth; ++t )
             {
                 T* YCol = Z.Buffer( 0, rowShift+t*c );
                 const T* XCol = &getBuffer[t*localHeight];

                 for( Int s=0; s<localHeight; ++s )
                     YCol[colShift+s*r] = XCol[s];
             }
         }

 template<typename T>
 void RmaInterface<T>::Iput( const Matrix<T>& Z, Int i, Int j )
     if( i < 0 || j < 0 )
         LogicError( "Submatrix offsets must be non-negative" );

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated" );

     DistMatrix<T>& Y = *GlobalArrayPut_;
@@ -718,14 +729,15 @@ void RmaInterface<T>::Iput( const Matrix<T>& Z, Int i, Int j )
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
     const Int XLDim = Z.LDim();
+
     // local matrix width and height
     const Int height = Z.Height();
     const Int width = Z.Width();
     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
-    const Int YLDim = Y.LDim ();
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );
+    const Int YLDim = Y.LDim();
     const T* XBuffer = Z.LockedBuffer();

     for( Int step=0; step<p; ++step )
     {
         {
             const Int destination = receivingRow + r*receivingCol;
             const Int index =
-                NextIndex (
+                NextIndex(
                     numEntries,
                     putVector_[destination] );

@@ -756,9 +768,9 @@ void RmaInterface<T>::Iput( const Matrix<T>& Z, Int i, Int j )
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // put
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Iput ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Iput( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window );
             }
         }
@@ -776,7 +788,7 @@ void RmaInterface<T>::Iacc( const Matrix<T>& Z, Int i, Int j )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Iacc" ) )

-    if ( !toBeAttachedForPut_ )
+    if( !toBeAttachedForPut_ )
         LogicError( "Global matrix cannot be updated." );

     if( i < 0 || j < 0 )
@@ -797,13 +809,14 @@ void RmaInterface<T>::Iacc( const Matrix<T>& Z, Int i, Int j )
     const Int colAlign = ( Y.ColAlign() + i ) % r;
     const Int rowAlign = ( Y.RowAlign() + j ) % c;
     const Int XLDim = Z.LDim();
-    const Int YLDim = Y.LDim ();
+    const Int YLDim = Y.LDim();
+
     // local matrix width and height
     const Int height = Z.Height();
     const Int width = Z.Width();
     const T* XBuffer = Z.LockedBuffer();
-    const Int iLocalOffset = Length( i, Y.ColShift (), r );
-    const Int jLocalOffset = Length( j, Y.RowShift (), c );
+    const Int iLocalOffset = Length( i, Y.ColShift(), r );
+    const Int jLocalOffset = Length( j, Y.RowShift(), c );

     Int receivingRow = myProcessRow;
     Int receivingCol = myProcessCol;
@@ -820,7 +833,7 @@ void RmaInterface<T>::Iacc( const Matrix<T>& Z, Int i, Int j )
         {
             const Int destination = receivingRow + r*receivingCol;
             const Int index =
-                NextIndex (
+                NextIndex(
                     numEntries,
                     putVector_[destination] );
             T* sendBuffer = putVector_[destination][index].data();
@@ -834,9 +847,9 @@ void RmaInterface<T>::Iacc( const Matrix<T>& Z, Int i, Int j )
                     thisSendCol[s] = thisXCol[colShift+s*r];

                 // acc
-                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
-                mpi::Iacc ( &sendBuffer[t*localHeight], localHeight,
-                            destination, disp, localHeight, window );
+                mpi::Aint disp = ( iLocalOffset + ( jLocalOffset+t ) * YLDim ) * sizeof( T );
+                mpi::Iacc( &sendBuffer[t*localHeight], localHeight,
+                           destination, disp, localHeight, window );
             }
         }

@@ -865,7 +878,7 @@ void RmaInterface<T>::LocalFlush()
     if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
         LogicError( "Must initiate transfer before flushing." );

-    mpi::FlushLocal ( window );
+    mpi::FlushLocal( window );
 }

 // Local completion (specific to Z) upon
@@ -881,10 +894,10 @@ void RmaInterface<T>::LocalFlush( Matrix<T>& Z )
     // if there are no request based RMA pending
     // for Z, then this functions acts like Flush
     // local all
-    if ( !anyPendingXfers( Z ) )
+    if( !anyPendingXfers( Z ) )
         LocalFlush();
     else
-        Wait ( Z );
+        Wait( Z );
 }

 // there is no use as of now in
@@ -899,11 +912,11 @@ void RmaInterface<T>::Flush( Matrix<T>& Z )
     if( !toBeAttachedForPut_ || !toBeAttachedForGet_ )
         LogicError( "Must initiate transfer before flushing." );

-    mpi::Flush ( window );
+    mpi::Flush( window );
 }

 template<typename T>
-bool RmaInterface<T>::anyPendingXfers ( Matrix<T>& Z )
+bool RmaInterface<T>::anyPendingXfers( Matrix<T>& Z )
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::anyPendingXfers" ) )
     // by default, number of matrices
@@ -913,9 +926,9 @@ bool RmaInterface<T>::anyPendingXfers( Matrix<T>& Z )
     const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
@@ -925,7 +938,7 @@ bool RmaInterface<T>::anyPendingXfers( Matrix<T>& Z )

     // matrix not found
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
         return false;

     return true;
@@ -952,9 +965,9 @@ void RmaInterface<T>::WaitAny( Matrix<T>& Z )
     const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
@@ -964,22 +977,22 @@ void RmaInterface<T>::WaitAny( Matrix<T>& Z )

     // matrix not found
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
         return;

     // data
-    for ( int rank = 0; rank < p; ++rank )
+    for( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+        if( matrices_[matrixIndex].statuses_[rank].size() == 0 )
             continue;

-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-        for ( int i = 0; i < numDataStatuses; i++ )
+        for( int i = 0; i < numDataStatuses; i++ )
         {
-            if ( !matrices_[matrixIndex].statuses_[rank][i] )
+            if( !matrices_[matrixIndex].statuses_[rank][i] )
             {
-                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+                mpi::Wait( matrices_[matrixIndex].requests_[rank][i] );
                 matrices_[matrixIndex].statuses_[rank][i] = true;
                 return;
             }
@@ -1004,9 +1017,9 @@ void RmaInterface<T>::Wait( Matrix<T>& Z )
     const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
@@ -1016,27 +1029,27 @@ void RmaInterface<T>::Wait( Matrix<T>& Z )

     // matrix not found
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
         return;

     // data
-    for ( int rank = 0; rank < p; ++rank )
+    for( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+        if( matrices_[matrixIndex].statuses_[rank].size() == 0 )
             continue;

-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-        for ( int i = 0; i < numDataStatuses; i++ )
+        for( int i = 0; i < numDataStatuses; i++ )
         {
-            mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+            mpi::Wait( matrices_[matrixIndex].requests_[rank][i] );
             matrices_[matrixIndex].statuses_[rank][i] = true;
         }
     }
 }

 template<typename T>
-void RmaInterface<T>::Waitall ()
+void RmaInterface<T>::Waitall()
 {
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Waitall" ) )
@@ -1051,15 +1064,15 @@ void RmaInterface<T>::Waitall()
     const Int numMatrices = matrices_.size();

     // data
-    for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
+    for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
     {
-        for ( int rank = 0; rank < p; ++rank )
+        for( int rank = 0; rank < p; ++rank )
         {
-            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-            for ( int i = 0; i < numDataStatuses; i++ )
+            for( int i = 0; i < numDataStatuses; i++ )
             {
-                mpi::Wait ( matrices_[matrixIndex].requests_[rank][i] );
+                mpi::Wait( matrices_[matrixIndex].requests_[rank][i] );
                 matrices_[matrixIndex].statuses_[rank][i] = true;
             }
         }
@@ -1083,9 +1096,9 @@ bool RmaInterface<T>::Test( Matrix<T>& Z )
     const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
@@ -1095,22 +1108,22 @@ bool RmaInterface<T>::Test( Matrix<T>& Z )

     // matrix not found
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
         return true;

-    for ( int rank = 0; rank < p; ++rank )
+    for( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+        if( matrices_[matrixIndex].statuses_[rank].size() == 0 )
             continue;

-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-        for ( int i = 0; i < numDataStatuses; i++ )
+        for( int i = 0; i < numDataStatuses; i++ )
         {
             matrices_[matrixIndex].statuses_[rank][i] =
-                !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+                !mpi::Test( matrices_[matrixIndex].requests_[rank][i] );

-            if ( matrices_[matrixIndex].statuses_[rank][i] )
+            if( matrices_[matrixIndex].statuses_[rank][i] )
                 return false;
         }
     }
@@ -1139,9 +1152,9 @@ bool RmaInterface<T>::TestAny( Matrix<T>& Z )
     const void* base_address = static_cast<void*>( const_cast<T*>( Z.LockedBuffer() ) );

     // search for matrix base
-    for ( Int m = 0; m < numMatrices; m++ )
+    for( Int m = 0; m < numMatrices; m++ )
     {
-        if ( matrices_[m].base_ == base_address )
+        if( matrices_[m].base_ == base_address )
         {
             matrixIndex = m;
             break;
@@ -1151,22 +1164,22 @@ bool RmaInterface<T>::TestAny( Matrix<T>& Z )

     // matrix not found
-    if ( matrixIndex == numMatrices )
+    if( matrixIndex == numMatrices )
         return true;

-    for ( int rank = 0; rank < p; ++rank )
+    for( int rank = 0; rank < p; ++rank )
     {
-        if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+        if( matrices_[matrixIndex].statuses_[rank].size() == 0 )
             continue;

-        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+        const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-        for ( int i = 0; i < numDataStatuses; i++ )
+        for( int i = 0; i < numDataStatuses; i++ )
         {
             matrices_[matrixIndex].statuses_[rank][i] =
-                !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+                !mpi::Test( matrices_[matrixIndex].requests_[rank][i] );

-            if ( matrices_[matrixIndex].statuses_[rank][i] )
+            if( matrices_[matrixIndex].statuses_[rank][i] )
                 continue;
             else
                 return true;
@@ -1191,21 +1204,21 @@ bool RmaInterface<T>::Testall()
     const Int numMatrices = matrices_.size();

     // data
-    for ( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
+    for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex )
     {
-        for ( int rank = 0; rank < p; ++rank )
+        for( int rank = 0; rank < p; ++rank )
         {
-            if ( matrices_[matrixIndex].statuses_[rank].size() == 0 )
+            if( matrices_[matrixIndex].statuses_[rank].size() == 0 )
                 continue;

-            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size ();
+            const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size();

-            for ( int i = 0; i < numDataStatuses; i++ )
+            for( int i = 0; i < numDataStatuses; i++ )
             {
                 matrices_[matrixIndex].statuses_[rank][i] =
-                    !mpi::Test ( matrices_[matrixIndex].requests_[rank][i] );
+                    !mpi::Test( matrices_[matrixIndex].requests_[rank][i] );

-                if ( matrices_[matrixIndex].statuses_[rank][i] )
+                if( matrices_[matrixIndex].statuses_[rank][i] )
                     return false;
             }
         }
@@ -1220,7 +1233,7 @@ void RmaInterface<T>::Detach()
     DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Detach" ) )

     // destructor will call detach again...
-    if ( detached_ )
+    if( detached_ )
         return;

     if( !attached_ )
@@ -1229,18 +1242,25 @@ void RmaInterface<T>::Detach()
     const Grid& g = ( toBeAttachedForPut_ ?
                       GlobalArrayPut_->Grid() :
                       GlobalArrayGet_->Grid() );
+
     mpi::Barrier( g.VCComm() );
+
     attached_ = false;
     detached_ = true;
+
     toBeAttachedForPut_ = false;
     toBeAttachedForGet_ = false;
+
     GlobalArrayPut_ = 0;
     GlobalArrayGet_ = 0;
+
     putVector_.clear();
     getVector_.clear();
+
     matrices_.clear();
-    mpi::WindowUnlock ( window );
-    mpi::WindowFree ( window );
+
+    mpi::WindowUnlock( window );
+    mpi::WindowFree( window );
 }

 #define PROTO(T) template class RmaInterface<T>;

From 361de46c54528aac61db1e1512834cdf2e941087 Mon Sep 17 00:00:00 2001
From: Sayan Ghosh
Date: Wed, 25 Feb 2015 14:30:34 -0800
Subject: [PATCH 110/110] minor indentation

---
 src/core/AxpyInterface.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp
index 0eb4629f91..d3bbc59938 100644
--- a/src/core/AxpyInterface.cpp
+++ b/src/core/AxpyInterface.cpp
@@ -20,12 +20,12 @@ namespace El
 template<typename T>
 bool AxpyInterface<T>::Finished()
 {
     DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Finished" );
-                if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ )
-                LogicError( "Not attached" ); )
-    const Grid& g = ( attachedForLocalToGlobal_ ?
-                      localToGlobalMat_->Grid() :
-                      globalToLocalMat_->Grid() );
+                if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ )
+                    LogicError( "Not attached" ); )
+    const Grid& g = ( attachedForLocalToGlobal_ ?
+                      localToGlobalMat_->Grid() :
+                      globalToLocalMat_->Grid() );
     const Int p = g.Size();

     bool finished = true;
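Taken together, the life cycle these patches converge on is attach, one-sided transfers, flush, detach. A hypothetical caller, sketched as comments (A, B, i, j and their setup are assumptions, not code from the series):

    // DistMatrix<double> A;     // distributed over some El::Grid
    // Matrix<double>     B;     // local patch to contribute
    // RmaInterface<double> rma;
    // rma.Attach( A );          // collective: creates and locks the window
    // rma.Acc( B, i, j );       // one-sided accumulate of B into A at (i,j)
    // rma.Flush( B );           // complete B's outstanding transfers
    // rma.Detach();             // collective: barrier, unlock, free window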