diff --git a/CMakeLists.txt b/CMakeLists.txt index de1d1f58e0..49486a1456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,11 @@ option(EL_USE_64BIT_INTS "Use 64-bit integers where possible" OFF) option(EL_USE_CUSTOM_ALLTOALLV "Avoid MPI_Alltoallv for performance reasons" ON) option(EL_BARRIER_IN_ALLTOALLV "Barrier before posting non-blocking recvs" OFF) +# MPI misc. +# Enable MPI-3 routines +option(EL_ENABLE_RMA_AXPY "Choose new Rma Axpy interface implemented using MPI-3 one sided routines" ON) +option(EL_USE_IBARRIER_FOR_AXPY "Use MPI-3 IBarrier for synchronization in AxpyInterface" ON) + # If the version of METIS packaged with Elemental is to be built (the default), # then no METIS-specific variables need to be specified, but if the user prefers # to use their own version, then the root path of the installation should be diff --git a/cmake/config.h.cmake b/cmake/config.h.cmake index 9e375f56a2..d9565adb00 100644 --- a/cmake/config.h.cmake +++ b/cmake/config.h.cmake @@ -79,6 +79,11 @@ #cmakedefine EL_VECTOR_WARNINGS #cmakedefine EL_AVOID_OMP_FMA +/* MPI-3 related */ +#cmakedefine EL_ENABLE_RMA_AXPY +#cmakedefine EL_USE_IBARRIER_FOR_AXPY + + #cmakedefine EL_DECLSPEC #ifdef EL_DECLSPEC # define EL_EXPORT __declspec(dllexport) diff --git a/include/El/core.hpp b/include/El/core.hpp index 70f47cb405..47a623635a 100644 --- a/include/El/core.hpp +++ b/include/El/core.hpp @@ -151,6 +151,8 @@ template class BlockDistMatrix; #include "El/core/random/decl.hpp" #include "El/core/random/impl.hpp" #include "El/core/AxpyInterface.hpp" +#include "El/core/RmaInterface.hpp" +#include "El/core/AxpyInterface2.0.hpp" #include "El/core/Graph.hpp" // TODO: Sequential map diff --git a/include/El/core/AxpyInterface.hpp b/include/El/core/AxpyInterface.hpp index 88b9f6b834..c049de2704 100644 --- a/include/El/core/AxpyInterface.hpp +++ b/include/El/core/AxpyInterface.hpp @@ -7,8 +7,8 @@ This interface is mainly due to Martin Schatz, but it was put into its current form by Jack 
Poulson. - This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #pragma once @@ -24,16 +24,16 @@ using namespace AxpyTypeNS; template class AxpyInterface -{ +{ public: AxpyInterface(); ~AxpyInterface(); AxpyInterface( AxpyType type, DistMatrix& Z ); - AxpyInterface( AxpyType type, const DistMatrix& Z ); + AxpyInterface( AxpyType type, const DistMatrix& Z ); - void Attach( AxpyType type, DistMatrix& Z ); - void Attach( AxpyType type, const DistMatrix& Z ); + void Attach( AxpyType type, DistMatrix& Z ); + void Attach( AxpyType type, const DistMatrix& Z ); void Axpy( T alpha, Matrix& Z, Int i, Int j ); void Axpy( T alpha, const Matrix& Z, Int i, Int j ); @@ -41,48 +41,66 @@ class AxpyInterface void Detach(); private: - static const Int - DATA_TAG =1, - EOM_TAG =2, - DATA_REQUEST_TAG=3, - DATA_REPLY_TAG =4; - +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) + static const Int + DATA_TAG =1, + DATA_REQUEST_TAG=2, + DATA_REPLY_TAG =3; +#else + static const Int + DATA_TAG =1, + EOM_TAG =2, + DATA_REQUEST_TAG=3, + DATA_REPLY_TAG =4; +#endif + + //request object for polling on Issends bool attachedForLocalToGlobal_, attachedForGlobalToLocal_; + DistMatrix* localToGlobalMat_; const DistMatrix* globalToLocalMat_; - vector sentEomTo_, haveEomFrom_; - vector recvVector_; - vector eomSendRequests_; +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else + std::vector sentEomTo_, haveEomFrom_; + std::vector eomSendRequests_; +#endif + + std::vector> + sendingData_, sendingRequest_, sendingReply_; + std::vector> + dataSendRequests_, requestSendRequests_, replySendRequests_; - vector>> dataVectors_, requestVectors_, replyVectors_; - vector> sendingData_, sendingRequest_, sendingReply_; - vector> 
- dataSendRequests_, requestSendRequests_, replySendRequests_; + std::vector recvVector_; + std::vector>> + dataVectors_, requestVectors_, replyVectors_; byte sendDummy_, recvDummy_; + // Progress functions +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) + bool ReturnRequestStatuses(); +#else // Check if we are done with this attachment's work bool Finished(); - - // Progress functions - void UpdateRequestStatuses(); void HandleEoms(); - void HandleLocalToGlobalData(); - void HandleGlobalToLocalRequest(); void StartSendingEoms(); void FinishSendingEoms(); - - void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); - void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); + void UpdateRequestStatuses(); +#endif Int ReadyForSend ( Int sendSize, - deque>& sendVectors, - deque& requests, - deque& requestStatuses ); + std::deque>& sendVectors, + std::deque& requests, + std::deque& requestStatuses ); + + void HandleLocalToGlobalData(); + void HandleGlobalToLocalRequest(); + + void AxpyLocalToGlobal( T alpha, const Matrix& X, Int i, Int j ); + void AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ); }; } // namespace El - #endif // ifndef EL_AXPYINTERFACE_HPP diff --git a/include/El/core/AxpyInterface2.0.hpp b/include/El/core/AxpyInterface2.0.hpp new file mode 100644 index 0000000000..9a54c61dfc --- /dev/null +++ b/include/El/core/AxpyInterface2.0.hpp @@ -0,0 +1,147 @@ +/* + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at + http://opensource.org/licenses/BSD-2-Clause +*/ +#pragma once +#ifndef EL_AXPYINTERFACE2_HPP +#define EL_AXPYINTERFACE2_HPP + +namespace El { +template +class AxpyInterface2 +{ +public: + AxpyInterface2(); + ~AxpyInterface2(); + + AxpyInterface2( DistMatrix& Z ); + AxpyInterface2( const DistMatrix& Z ); + + // collective epoch initialization routines + void Attach( DistMatrix& Z ); + void Attach( const DistMatrix& Z ); + void Detach(); 
+ + // remote update routines + + // requires Flush for local+remote + // completion + void Iput( Matrix& Z, Int i, Int j ); + void Iput( const Matrix& Z, Int i, Int j ); + + void Iget( Matrix& Z, Int i, Int j ); + + void Iacc( Matrix& Z, Int i, Int j ); + void Iacc( const Matrix& Z, Int i, Int j ); + + // locally blocking update routines + // reuse input buffer when returns + void Acc( Matrix& Z, Int i, Int j ); + void Acc( const Matrix& Z, Int i, Int j ); + + void Put( Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); + + // End to End blocking + // will be deprecated soon + void Eacc( Matrix& Z, Int i, Int j ); + void Eacc( const Matrix& Z, Int i, Int j ); + + void Eput( Matrix& Z, Int i, Int j ); + void Eput( const Matrix& Z, Int i, Int j ); + + void Get( Matrix& Z, Int i, Int j ); + + // synchronization routines + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + +private: + + static const Int + DATA_PUT_TAG =1, + DATA_GET_TAG =2, + DATA_ACC_TAG =3, + REQUEST_GET_TAG =4, + COORD_ACC_TAG =5, + COORD_PUT_TAG =6; + + // struct for passing data + struct matrix_params_ + { + const void *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector matrices_; + + // struct for passing coordinates + struct coord_params_ + { + const void *base_; + std::vector>> + coord_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector coords_; + + // for blocking interface + // copying input buffer in this + // intermediate buffer so that input + // buffer could be reused + std::vector>> + dataVectors_; + + DistMatrix* GlobalArrayPut_; + const DistMatrix* GlobalArrayGet_; + + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_; + + // next index for data and coord + Int NextIndexData ( + Int target, + Int dataSize, + const void* base_address, + Int *mindex ); + + Int NextIndexCoord ( + Int i, Int j, + Int target, + const void* base_address, + Int *cindex ); + + 
bool Testall(); + bool Test( Matrix& Z ); + bool Test( const Matrix& Z ); + bool TestAny( Matrix& Z ); + bool TestAny( const Matrix& Z ); + + void Waitall(); + void Wait( Matrix& Z ); + void Wait( const Matrix& Z ); + void WaitAny( Matrix& Z ); + void WaitAny( const Matrix& Z ); + + // these are only used for nonblocking + // update routines + void HandleGlobalToLocalData( Matrix& Z ); + + void HandleLocalToGlobalData( Matrix& Z, Int source ); + void HandleLocalToGlobalAcc( Matrix& Z, Int source ); + + void HandleLocalToGlobalData( const Matrix& Z, Int source ); + void HandleLocalToGlobalAcc( const Matrix& Z, Int source ); +}; +} // namespace El +#endif // ifndef EL_AXPYINTERFACE2_HPP diff --git a/include/El/core/RmaInterface.hpp b/include/El/core/RmaInterface.hpp new file mode 100644 index 0000000000..9587081bbe --- /dev/null +++ b/include/El/core/RmaInterface.hpp @@ -0,0 +1,124 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Jeff Hammond (Intel) + All rights reserved. + + Authors: + Jeff Hammond adapted the RMA interface from the AXPY one. 
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at + http://opensource.org/licenses/BSD-2-Clause +*/ +#pragma once +#ifndef EL_RMAINTERFACE_HPP +#define EL_RMAINTERFACE_HPP + +namespace El { +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +template +class RmaInterface +{ +public: + RmaInterface(); + ~RmaInterface(); + + RmaInterface( DistMatrix& Z ); + RmaInterface( const DistMatrix& Z ); + + void Attach( DistMatrix& Z ); + void Attach( const DistMatrix& Z ); + + // Local completion + void Put( Matrix& Z, Int i, Int j ); + void Put( const Matrix& Z, Int i, Int j ); + + void Get( Matrix& Z, Int i, Int j ); + + void Acc( Matrix& Z, Int i, Int j ); + void Acc( const Matrix& Z, Int i, Int j ); + + // No local completion + void Iput( Matrix& Z, Int i, Int j ); + void Iput( const Matrix& Z, Int i, Int j ); + + void Iacc( Matrix& Z, Int i, Int j ); + void Iacc( const Matrix& Z, Int i, Int j ); + + // Request based RMA + void Rput( Matrix& Z, Int i, Int j ); + void Rput( const Matrix& Z, Int i, Int j ); + + void Racc( Matrix& Z, Int i, Int j ); + void Racc( const Matrix& Z, Int i, Int j ); + + // Synchronization routines + void Flush( Matrix& Z ); + void Flush( const Matrix& Z ); + void LocalFlush( const Matrix& Z ); + void LocalFlush( Matrix& Z ); + void LocalFlush(); + + void Detach(); + +private: + + mpi::Window window; + + // struct for passing data + // for request based rma + struct matrix_params_ + { + const void *base_; + std::vector>> + data_; + std::vector> + requests_; + std::vector> + statuses_; + }; + + std::vector matrices_; + + // buffers for rma + std::vector>> + getVector_, putVector_; + + DistMatrix* GlobalArrayPut_; + const DistMatrix* GlobalArrayGet_; + + bool toBeAttachedForPut_, toBeAttachedForGet_, + attached_, detached_; + + // next index for data + Int NextIndex ( + Int dataSize, + std::deque >& dataVectors ); + + Int NextIndex ( + Int target, + Int 
dataSize, + const void* base_address, + Int* mindex ); + + // only relevant for request-based + // passive RMA + bool anyPendingXfers ( Matrix& Z ); + bool anyPendingXfers ( const Matrix& Z ); + + bool Testall(); + bool Test( Matrix& Z ); + bool Test( const Matrix& Z ); + bool TestAny( Matrix& Z ); + bool TestAny( const Matrix& Z ); + + void Waitall(); + void Wait( Matrix& Z ); + void Wait( const Matrix& Z ); + void WaitAny( Matrix& Z ); + void WaitAny( const Matrix& Z ); +}; +#endif // EL_ENABLE_RMA_AXPY +} // namespace El +#endif // ifndef EL_RMAINTERFACE_HPP diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index f39e8c5e79..4585984111 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -10,7 +10,7 @@ #pragma once #ifndef EL_IMPORTS_MPI_HPP #define EL_IMPORTS_MPI_HPP - +#include namespace El { namespace mpi { @@ -30,6 +30,50 @@ namespace mpi { #endif #endif +// Use MPI-3 IBarrier in developing a Non-blocking +// consensus instead of El strict EOM matching +// see - Scalable communication protocols for +// dynamic sparse data exchange by Hoefler, et al +//#ifndef EL_USE_IBARRIER_FOR_AXPY +//#define EL_USE_IBARRIER_FOR_AXPY +//#endif + +//#ifndef EL_ENABLE_RMA_AXPY +//#define EL_ENABLE_RMA_AXPY +//#endif + +// Use derived datatypes for strided +// vector communication patterns +//#ifndef EL_USE_DERIVED_DATATYPE +//#define EL_USE_DERIVED_DATATYPE +//#endif + +// explicit progress for RMA +//#ifndef EL_EXPLICIT_PROGRESS +//#define EL_EXPLICIT_PROGRESS +//#endif + +// no acc ordering +//#ifndef EL_NO_ACC_ORDERING +//#define EL_NO_ACC_ORDERING +//#endif + +// put/get atomicity +//#ifndef EL_ENSURE_PUT_ATOMICITY +//#define EL_ENSURE_PUT_ATOMICITY +//#endif + +//#ifndef EL_ENSURE_GET_ATOMICITY +//#define EL_ENSURE_GET_ATOMICITY +//#endif + +#ifndef EL_INT_SAFE_CAST +#define EL_INT_SAFE_CAST(x) \ + (x < std::numeric_limits::max () && \ + x > std::numeric_limits::min ())? 
\ + static_cast(x): (-99999) +#endif + struct Comm { MPI_Comm comm; @@ -67,11 +111,43 @@ typedef MPI_Datatype Datatype; typedef MPI_Errhandler ErrorHandler; typedef MPI_Request Request; typedef MPI_Status Status; +typedef MPI_Message Message; typedef MPI_User_function UserFunction; - +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +typedef MPI_Win Window; +typedef enum +{ + STRICT_ACC_ORDERING = 0, + PARTIAL_ACC_ORDERING = 2, + NO_ACC_ORDERING = 4 +} acc_order_t; +#endif +// for ddt +#ifdef EL_USE_DERIVED_DATATYPE +typedef struct El_strided_s +{ + unsigned num; + size_t* sizes; + MPI_Aint* offsets; +} El_strided_t; +typedef struct El_iov_s +{ + unsigned count; + size_t* sizes; + MPI_Aint* offsets; +} El_iov_t; +typedef enum +{ + FIXED_BLOCK_FIXED_STRIDE = 1, + FIXED_BLOCK_VAR_STRIDE = 2, + UNKNOWN_BLOCK_STRIDE = 4 +} vector_pattern_t; +#endif +typedef MPI_Info Info; // Standard constants const int ANY_SOURCE = MPI_ANY_SOURCE; const int ANY_TAG = MPI_ANY_TAG; +const int ERR_RANK = MPI_ERR_RANK; #ifdef EL_HAVE_MPI_QUERY_THREAD const int THREAD_SINGLE = MPI_THREAD_SINGLE; const int THREAD_FUNNELED = MPI_THREAD_FUNNELED; @@ -92,6 +168,7 @@ const ErrorHandler ERRORS_RETURN = MPI_ERRORS_RETURN; const ErrorHandler ERRORS_ARE_FATAL = MPI_ERRORS_ARE_FATAL; const Group GROUP_EMPTY = MPI_GROUP_EMPTY; const Request REQUEST_NULL = MPI_REQUEST_NULL; + const Op MAX = MPI_MAX; const Op MIN = MPI_MIN; const Op MAXLOC = MPI_MAXLOC; @@ -169,14 +246,143 @@ void Translate ( Comm origComm, int size, const int* origRanks, Comm newComm, int* newRanks ); +// Derived datatype +// ================ +#ifdef EL_USE_DERIVED_DATATYPE +// strided/vector to datatype +void StridedDatatype (El_strided_t* stride_descr, + mpi::Datatype old_type, mpi::Datatype* new_type, + size_t* source_dims); +void VectorDatatype (El_iov_t * vect_descr, + mpi::Datatype old_type, mpi::Datatype * new_type, + vector_pattern_t data_pattern); +#endif // EL_USE_DERIVED_DATATYPE +// MPI-3 one-sided +// =============== 
+#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +// Utilities +// --------- +void SetWindowProp ( Window& window, int prop ); +void CheckBounds ( Window & window, mpi::Datatype win_type, mpi::Datatype type, +size_t count, ptrdiff_t target_offset ); +#ifdef EL_EXPLICIT_PROGRESS +void RmaProgress ( Comm comm ); +#endif +long ReadInc (Window & win, Aint offset, + long inc, int fop_root); +// Window creation/update/delete +// ----------------------------- +void WindowLock( int rank, Window& window ); +void WindowLock( Window& window ); +void WindowUnlock( int rank, Window& window ); +void WindowUnlock( Window& window ); +void WindowCreate( void* baseptr, int size, Comm comm, Window& window ); +void WindowFree (Window & window); +// One-sided operations +// -------------------- +// put +// --- +template +void Iput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Rput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Iput( T source, int target_rank, Aint disp, Window& window ); +template +void Rput( T source, int target_rank, Aint disp, + Window& window, Request& request ); +// get +// --- +template +void Iget (R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Rget (R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Rget (Complex* source, int origin_count, int target_rank, + Aint 
disp, int target_count, Window & window, + Request & request); +template +void Iget( T source, int target_rank, Aint disp, Window& window ); +template +void Rget( T source, int target_rank, Aint disp, + Window& window, Request& request ); +// acc +// --- +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template +void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request); +template +void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request); +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request); +template +void Iacc (const T source, int target_rank, Aint disp, Window & window); +template +void Racc (const T source, int target_rank, Aint disp, Window & window, + Request & request); +// Synchronization +// --------------- +void Flush( int target_rank, Window& window ); +void Flush( Window & window ); +void FlushLocal( int target_rank, Window& window ); +void FlushLocal( Window & window ); +#endif // EL_ENABLE_RMA_AXPY + // Utilities void Barrier( Comm comm ); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +void IBarrier( Comm comm, Request& request ); +#endif +void RequestFree( Request& request ); void Wait( Request& request ); void Wait( Request& request, Status& status ); +//TODO add another function for getting statuses +void WaitAny (int numRequests, Request * requests, Int * index); void WaitAll( int numRequests, Request* requests ); void WaitAll( int numRequests, 
Request* requests, Status* statuses ); bool Test( Request& request ); +bool Test( Request& request, Status& status ); +bool Testany( int count, Request* requests ); +bool Testany( int count, Request* requests, int& indx ); +bool Testany( int count, Request* requests, int& indx, Status& status ); bool IProbe( int source, int tag, Comm comm, Status& status ); +bool IProbe( int source, Comm comm, Status& status ); +bool IProbe( Comm comm, Status& status ); +void Probe ( int source, int tag, Comm comm, Status & status ); +void Probe ( int source, Comm comm, Status & status ); +void Probe ( Comm comm, Status & status ); +// matching probe +bool IMprobe( int source, int tag, Comm comm, Status& status, Message& message ); template int GetCount( Status& status ); @@ -253,6 +459,15 @@ T TaggedRecv( int from, int tag, Comm comm ); // If the recv count is one and the tag is irrelevant template T Recv( int from, Comm comm ); +// matched recv +template +void TaggedMrecv( R* buf, int count, Message & message ); +template +void TaggedMrecv( Complex* buf, int count, Message & message ); +template +void TaggedRecvS( R* buf, int count, int from, int tag, Comm comm, Status & status ); +template +void TaggedRecvS( Complex* buf, int count, int from, int tag, Comm comm, Status & status ); // Non-blocking recv // ----------------- diff --git a/src/core/AxpyInterface.cpp b/src/core/AxpyInterface.cpp index adf78da8d2..d3bbc59938 100644 --- a/src/core/AxpyInterface.cpp +++ b/src/core/AxpyInterface.cpp @@ -7,29 +7,29 @@ This interface is mainly due to Martin Schatz, but it was put into its current form by Jack Poulson. 
- This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ #include "El.hpp" -namespace El { - -template -bool AxpyInterface::Finished() +namespace El { - DEBUG_ONLY( - CallStackEntry cse("AxpyInterface::Finished"); - if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) - LogicError("Not attached"); - ) - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else +template bool AxpyInterface ::Finished() +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Finished" ); + if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) + LogicError( "Not attached" ); ) + const Grid& g = ( attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid() : globalToLocalMat_->Grid() ); + const Int p = g.Size(); + bool finished = true; - bool finished = true; - for( Int rank=0; rank::Finished() break; } } + return finished; } -template -void AxpyInterface::HandleEoms() +template void AxpyInterface ::HandleEoms() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleEoms")) - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleEoms" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid() : globalToLocalMat_->Grid() ); const Int p = g.Size(); - UpdateRequestStatuses(); // Try to progress our EOM sends - for( Int i=0; i::HandleEoms() break; } } + const Int numRequests = sendingRequest_[i].size(); - for( Int j=0; j::HandleEoms() break; } } + if( shouldSendEom ) { mpi::Request& request = eomSendRequests_[i]; @@ -95,6 +100,7 @@ void AxpyInterface::HandleEoms() } mpi::Status status; + if( mpi::IProbe( mpi::ANY_SOURCE, EOM_TAG, g.VCComm(), status ) ) { const Int source = status.MPI_SOURCE; @@ -102,180 +108,178 @@ void AxpyInterface::HandleEoms() haveEomFrom_[source] = true; } } +#endif // EL_USE_IBARRIER_FOR_AXPY -template -void AxpyInterface::HandleLocalToGlobalData() +template void AxpyInterface ::HandleLocalToGlobalData() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleLocalToGlobalData")) - DistMatrix& Y = *localToGlobalMat_; + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleLocalToGlobalData" ) ) + DistMatrix & Y = *localToGlobalMat_; const Grid& g = Y.Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int myRow = g.Row(); const Int myCol = g.Col(); - mpi::Status status; + if( mpi::IProbe( mpi::ANY_SOURCE, DATA_TAG, g.VCComm(), status ) ) { - // Message exists, so recv and pack + // Message exists, so recv and pack const Int count = mpi::GetCount( status ); - DEBUG_ONLY( - if( count < Int(4*sizeof(Int)+sizeof(T)) ) - LogicError("Count was too small"); - ) - const Int source = status.MPI_SOURCE; + + DEBUG_ONLY( if( count < Int( 4 * sizeof( Int ) + sizeof( T ) ) ) + LogicError( "Count was too small" ); ) + const Int source = status.MPI_SOURCE; + recvVector_.resize( count ); byte* recvBuffer = recvVector_.data(); mpi::TaggedRecv( recvBuffer, count, source, DATA_TAG, g.VCComm() ); - // Extract the header byte* head = recvBuffer; - const Int i = *reinterpret_cast(head); - head += sizeof(Int); - const Int j = *reinterpret_cast(head); - head += sizeof(Int); - const Int height = *reinterpret_cast(head); 
- head += sizeof(Int); - const Int width = *reinterpret_cast(head); - head += sizeof(Int); - const T alpha = *reinterpret_cast(head); - head += sizeof(T); - DEBUG_ONLY( - if( height < 0 || width < 0 ) - RuntimeError - ("Unpacked heights were negative:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - if( i < 0 || j < 0 ) - RuntimeError - ("Unpacked offsets were negative:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - if( i+height > Y.Height() || j+width > Y.Width() ) - RuntimeError - ("Unpacked submatrix was out of bounds:\n", - " i= ",i,std::hex,"(",i,")\n",std::dec, - " j= ",j,std::hex,"(",j,")\n",std::dec, - " height=",height,std::hex,"(",height,")\n",std::dec, - " width= ",width,std::hex,"(",width,")\n",std::dec, - " alpha= ",alpha); - ) - - // Update Y - const T* XBuffer = reinterpret_cast(head); - const Int colAlign = (Y.ColAlign()+i) % r; - const Int rowAlign = (Y.RowAlign()+j) % c; + const Int i = *reinterpret_cast ( head ); + head += sizeof( Int ); + const Int j = *reinterpret_cast ( head ); + head += sizeof( Int ); + const Int height = *reinterpret_cast ( head ); + head += sizeof( Int ); + const Int width = *reinterpret_cast ( head ); + head += sizeof( Int ); + const T alpha = *reinterpret_cast ( head ); + head += sizeof( T ); + DEBUG_ONLY ( if ( height < 0 || width < 0 ) + RuntimeError + ( "Unpacked heights were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + if ( i < 0 || j < 0 ) + RuntimeError + ( "Unpacked 
offsets were negative:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + if ( i + height > Y.Height () || j + width > Y.Width () ) + RuntimeError + ( "Unpacked submatrix was out of bounds:\n", + " i= ", i, std::hex, "(", i, ")\n", std::dec, + " j= ", j, std::hex, "(", j, ")\n", std::dec, + " height=", height, std::hex, "(", height, ")\n", + std::dec, " width= ", width, std::hex, "(", width, + ")\n", std::dec, " alpha= ", alpha ); + ) + + // Update Y + const T* XBuffer = reinterpret_cast ( head ); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int colShift = Shift( myRow, colAlign, r ); const Int rowShift = Shift( myCol, rowAlign, c ); - const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); const Int iLocalOffset = Length( i, Y.ColShift(), r ); const Int jLocalOffset = Length( j, Y.RowShift(), c ); - for( Int t=0; t -void AxpyInterface::HandleGlobalToLocalRequest() + +template +void AxpyInterface ::HandleGlobalToLocalRequest() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::HandleGlobalToLocalRequest")) - const DistMatrix& X = *globalToLocalMat_; + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleGlobalToLocalRequest" ) ) + const DistMatrix & X = *globalToLocalMat_; const Grid& g = X.Grid(); const Int r = g.Height(); const Int c = g.Width(); const Int myRow = g.Row(); const Int myCol = g.Col(); - mpi::Status status; + if( mpi::IProbe( mpi::ANY_SOURCE, DATA_REQUEST_TAG, g.VCComm(), status ) ) { // Request exists, so recv const Int source = status.MPI_SOURCE; - const Int recvSize = 4*sizeof(Int); + const Int recvSize = 4 * sizeof( Int ); recvVector_.resize( recvSize ); byte* recvBuffer = recvVector_.data(); - mpi::TaggedRecv + + mpi::TaggedRecv ( recvBuffer, 
recvSize, source, DATA_REQUEST_TAG, g.VCComm() ); - - // Extract the header + + // Extract the header const byte* recvHead = recvBuffer; - const Int i = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int j = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int height = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - const Int width = *reinterpret_cast(recvHead); - recvHead += sizeof(Int); - - const Int colAlign = (X.ColAlign()+i) % r; - const Int rowAlign = (X.RowAlign()+j) % c; + const Int i = *reinterpret_cast ( recvHead ); + recvHead += sizeof( Int ); + const Int j = *reinterpret_cast ( recvHead ); + recvHead += sizeof( Int ); + const Int height = *reinterpret_cast ( recvHead ); + recvHead += sizeof( Int ); + const Int width = *reinterpret_cast ( recvHead ); + recvHead += sizeof( Int ); + + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; const Int colShift = Shift( myRow, colAlign, r ); const Int rowShift = Shift( myCol, rowAlign, c ); - const Int iLocalOffset = Length( i, X.ColShift(), r ); const Int jLocalOffset = Length( j, X.RowShift(), c ); const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); - const Int numEntries = localHeight*localWidth; - - const Int bufferSize = 2*sizeof(Int) + numEntries*sizeof(T); - const Int index = - ReadyForSend - ( bufferSize, replyVectors_[source], - replySendRequests_[source], sendingReply_[source] ); - + const Int numEntries = localHeight * localWidth; + const Int bufferSize = 2 * sizeof( Int ) + numEntries * sizeof( T ); + const Int index = ReadyForSend( bufferSize, replyVectors_[source], + replySendRequests_[source], + sendingReply_[source] ); // Pack the reply header byte* sendBuffer = replyVectors_[source][index].data(); byte* sendHead = sendBuffer; - *reinterpret_cast(sendHead) = myRow; sendHead += sizeof(Int); - *reinterpret_cast(sendHead) = myCol; sendHead += sizeof(Int); - + 
*reinterpret_cast ( sendHead ) = myRow; + sendHead += sizeof( Int ); + *reinterpret_cast ( sendHead ) = myCol; + sendHead += sizeof( Int ); // Pack the payload - T* sendData = reinterpret_cast(sendHead); - for( Int t=0; t( sendHead ); + + for( Int t = 0; t < localWidth; ++t ) { - T* sendCol = &sendData[t*localHeight]; - const T* XCol = X.LockedBuffer(iLocalOffset,jLocalOffset+t); + T* sendCol = &sendData[t * localHeight]; + const T* XCol = X.LockedBuffer( iLocalOffset, jLocalOffset + t ); MemCopy( sendCol, XCol, localHeight ); } // Fire off non-blocking send mpi::TaggedISSend - ( sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm(), + ( sendBuffer, bufferSize, source, DATA_REPLY_TAG, g.VCComm(), replySendRequests_[source][index] ); } } template AxpyInterface::AxpyInterface() -: attachedForLocalToGlobal_(false), attachedForGlobalToLocal_(false), - localToGlobalMat_(0), globalToLocalMat_(0), - sendDummy_(0), recvDummy_(0) + : attachedForLocalToGlobal_( false ), attachedForGlobalToLocal_( false ), + localToGlobalMat_( 0 ), globalToLocalMat_( 0 ), + sendDummy_( 0 ), recvDummy_( 0 ) { } template AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) -: sendDummy_(0), recvDummy_(0) + : sendDummy_( 0 ), recvDummy_( 0 ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyInterface" ) ) + if( type == LOCAL_TO_GLOBAL ) { attachedForLocalToGlobal_ = true; @@ -292,33 +296,34 @@ AxpyInterface::AxpyInterface( AxpyType type, DistMatrix& Z ) } const Int p = Z.Grid().Size(); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); + eomSendRequests_.resize( p ); +#endif sendingData_.resize( p ); sendingRequest_.resize( p ); sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - + dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( 
p ); - - eomSendRequests_.resize( p ); + + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); } template AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) -: sendDummy_(0), recvDummy_(0) + : sendDummy_( 0 ), recvDummy_( 0 ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyInterface")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyInterface" ) ) + if( type == LOCAL_TO_GLOBAL ) - { - LogicError("Cannot update a constant matrix"); - } + LogicError( "Cannot update a constant matrix" ); else { attachedForLocalToGlobal_ = false; @@ -328,58 +333,60 @@ AxpyInterface::AxpyInterface( AxpyType type, const DistMatrix& X ) } const Int p = X.Grid().Size(); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); - + eomSendRequests_.resize( p ); +#endif sendingData_.resize( p ); sendingRequest_.resize( p ); sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - + dataSendRequests_.resize( p ); requestSendRequests_.resize( p ); replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); + + dataVectors_.resize( p ); + requestVectors_.resize( p ); + replyVectors_.resize( p ); } -template -AxpyInterface::~AxpyInterface() -{ +template AxpyInterface ::~AxpyInterface() +{ if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) { if( std::uncaught_exception() ) { - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : - globalToLocalMat_->Grid() ); - ostringstream os; - os << g.Rank() - << "Uncaught exception detected during AxpyInterface destructor " - "that required a call to Detach. 
Instead of allowing for the " - "possibility of Detach throwing another exception and " - "resulting in a 'terminate', we instead immediately dump the " - "call stack (if not in RELEASE mode) since the program will " - "likely hang:" << endl; - cerr << os.str(); - DEBUG_ONLY(DumpCallStack()) + const Grid& g = ( attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid() : + globalToLocalMat_->Grid() ); + std::ostringstream os; + os << g.Rank() + << + "Uncaught exception detected during AxpyInterface destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY( DumpCallStack() ) } else - { - Detach(); - } + Detach(); } } -template -void AxpyInterface::Attach( AxpyType type, DistMatrix& Z ) +template +void AxpyInterface ::Attach( AxpyType type, DistMatrix & Z ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Attach")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Attach" ) ) + if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError("Must detach before reattaching."); + LogicError( "Must detach before reattaching." 
); + + const Grid& g = Z.Grid(); if( type == LOCAL_TO_GLOBAL ) { @@ -393,35 +400,40 @@ void AxpyInterface::Attach( AxpyType type, DistMatrix& Z ) } const Int p = Z.Grid().Size(); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); + eomSendRequests_.resize( p ); +#endif + + // request objects sendingRequest_.resize( p ); + sendingData_.resize( p ); sendingReply_.resize( p ); - + + // ready-to-send + requestSendRequests_.resize( p ); + replySendRequests_.resize( p ); + dataSendRequests_.resize( p ); + + // data dataVectors_.resize( p ); requestVectors_.resize( p ); replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); - requestSendRequests_.resize( p ); - replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); } -template -void AxpyInterface::Attach( AxpyType type, const DistMatrix& X ) +template +void AxpyInterface ::Attach( AxpyType type, + const DistMatrix & X ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Attach")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Attach" ) ) + if( attachedForLocalToGlobal_ || attachedForGlobalToLocal_ ) - LogicError("Must detach before reattaching."); + LogicError( "Must detach before reattaching." 
); if( type == LOCAL_TO_GLOBAL ) - { - LogicError("Cannot update a constant matrix"); - } + LogicError( "Cannot update a constant matrix" ); else { attachedForGlobalToLocal_ = true; @@ -429,59 +441,70 @@ void AxpyInterface::Attach( AxpyType type, const DistMatrix& X ) } const Int p = X.Grid().Size(); +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else + // eom sentEomTo_.resize( p, false ); haveEomFrom_.resize( p, false ); - - sendingData_.resize( p ); + eomSendRequests_.resize( p ); +#endif + + // ready-to-send sendingRequest_.resize( p ); + sendingData_.resize( p ); sendingReply_.resize( p ); - - dataVectors_.resize( p ); - requestVectors_.resize( p ); - replyVectors_.resize( p ); - - dataSendRequests_.resize( p ); + + // ready-to-send requestSendRequests_.resize( p ); replySendRequests_.resize( p ); - - eomSendRequests_.resize( p ); + dataSendRequests_.resize( p ); + + // data + dataVectors_.resize( p ); + replyVectors_.resize( p ); + requestVectors_.resize( p ); } -template -void AxpyInterface::Axpy( T alpha, Matrix& Z, Int i, Int j ) +template +void AxpyInterface ::Axpy( T alpha, Matrix & Z, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Axpy")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Axpy" ) ) + if( attachedForLocalToGlobal_ ) AxpyLocalToGlobal( alpha, Z, i, j ); else if( attachedForGlobalToLocal_ ) AxpyGlobalToLocal( alpha, Z, i, j ); else - LogicError("Cannot axpy before attaching."); + LogicError( "Cannot axpy before attaching." ); } -template -void AxpyInterface::Axpy( T alpha, const Matrix& Z, Int i, Int j ) +template +void AxpyInterface ::Axpy( T alpha, const Matrix & Z, Int i, + Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Axpy")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Axpy" ) ) + if( attachedForLocalToGlobal_ ) AxpyLocalToGlobal( alpha, Z, i, j ); else if( attachedForGlobalToLocal_ ) - LogicError("Cannot update a constant matrix."); + LogicError( "Cannot update a constant matrix." 
); else - LogicError("Cannot axpy before attaching."); + LogicError( "Cannot axpy before attaching." ); } // Update Y(i:i+height-1,j:j+width-1) += alpha X, where X is height x width -template -void AxpyInterface::AxpyLocalToGlobal -( T alpha, const Matrix& X, Int i, Int j ) +template +void AxpyInterface ::AxpyLocalToGlobal +( T alpha, const Matrix & X, Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyLocalToGlobal")) - DistMatrix& Y = *localToGlobalMat_; + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyLocalToGlobal" ) ) + DistMatrix & Y = *localToGlobalMat_; + if( i < 0 || j < 0 ) - LogicError("Submatrix offsets must be non-negative"); - if( i+X.Height() > Y.Height() || j+X.Width() > Y.Width() ) - LogicError("Submatrix out of bounds of global matrix"); + LogicError( "Submatrix offsets must be non-negative" ); + + if( i + X.Height() > Y.Height() || j + X.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); const Grid& g = Y.Grid(); const Int r = g.Height(); @@ -489,80 +512,89 @@ void AxpyInterface::AxpyLocalToGlobal const Int p = g.Size(); const Int myProcessRow = g.Row(); const Int myProcessCol = g.Col(); - const Int colAlign = (Y.ColAlign() + i) % r; - const Int rowAlign = (Y.RowAlign() + j) % c; - + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; const Int height = X.Height(); const Int width = X.Width(); - Int receivingRow = myProcessRow; Int receivingCol = myProcessCol; - for( Int step=0; step(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); - *reinterpret_cast(head) = alpha; head += sizeof(T); - - // Pack the payload - T* sendData = reinterpret_cast(head); + *reinterpret_cast ( head ) = i; + head += sizeof( Int ); + *reinterpret_cast ( head ) = j; + head += sizeof( Int ); + *reinterpret_cast ( head ) = height; + 
head += sizeof( Int ); + *reinterpret_cast ( head ) = width; + head += sizeof( Int ); + *reinterpret_cast ( head ) = alpha; + head += sizeof( T ); + + // Pack the payload + T* sendData = reinterpret_cast ( head ); const T* XBuffer = X.LockedBuffer(); const Int XLDim = X.LDim(); - for( Int t=0; t -void AxpyInterface::AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ) +template +void AxpyInterface ::AxpyGlobalToLocal( T alpha, Matrix & Y, + Int i, Int j ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::AxpyGlobalToLocal")) - const DistMatrix& X = *globalToLocalMat_; - + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::AxpyGlobalToLocal" ) ) + const DistMatrix & X = *globalToLocalMat_; const Int height = Y.Height(); const Int width = Y.Width(); - if( i+height > X.Height() || j+width > X.Width() ) - LogicError("Invalid AxpyGlobalToLocal submatrix"); + + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid AxpyGlobalToLocal submatrix" ); const Grid& g = X.Grid(); const Int r = g.Height(); @@ -570,71 +602,75 @@ void AxpyInterface::AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ) const Int p = g.Size(); // Send out the requests to all processes in the grid - for( Int rank=0; rank(head) = i; head += sizeof(Int); - *reinterpret_cast(head) = j; head += sizeof(Int); - *reinterpret_cast(head) = height; head += sizeof(Int); - *reinterpret_cast(head) = width; head += sizeof(Int); + *reinterpret_cast ( head ) = i; + head += sizeof( Int ); + *reinterpret_cast ( head ) = j; + head += sizeof( Int ); + *reinterpret_cast ( head ) = height; + head += sizeof( Int ); + *reinterpret_cast ( head ) = width; + head += sizeof( Int ); // Begin the non-blocking send mpi::TaggedISSend - ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm(), + ( sendBuffer, bufferSize, rank, DATA_REQUEST_TAG, g.VCComm(), requestSendRequests_[rank][index] ); } // Receive all of the replies Int numReplies = 0; + while( numReplies < p ) { HandleGlobalToLocalRequest(); - 
mpi::Status status; - if( mpi::IProbe( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm(), status ) ) + + if( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_REPLY_TAG, g.VCComm(), status ) ) { const Int source = status.MPI_SOURCE; - // Ensure that we have a recv buffer - const Int count = mpi::GetCount( status ); + const Int count = mpi::GetCount ( status ); recvVector_.resize( count ); byte* recvBuffer = recvVector_.data(); - - // Receive the data + + // Receive the data mpi::TaggedRecv ( recvBuffer, count, source, DATA_REPLY_TAG, g.VCComm() ); - - // Unpack the reply header + + // Unpack the reply header const byte* head = recvBuffer; - const Int row = *reinterpret_cast(head); - head += sizeof(Int); - const Int col = *reinterpret_cast(head); - head += sizeof(Int); - const T* recvData = reinterpret_cast(head); - - // Compute the local heights and offsets - const Int colAlign = (X.ColAlign()+i) % r; - const Int rowAlign = (X.RowAlign()+j) % c; + const Int row = *reinterpret_cast ( head ); + head += sizeof( Int ); + const Int col = *reinterpret_cast ( head ); + head += sizeof( Int ); + const T* recvData = reinterpret_cast ( head ); + + // Compute the local heights and offsets + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; const Int colShift = Shift( row, colAlign, r ); const Int rowShift = Shift( col, rowAlign, c ); const Int localHeight = Length( height, colShift, r ); const Int localWidth = Length( width, rowShift, c ); // Unpack the local matrix - for( Int t=0; t::AxpyGlobalToLocal( T alpha, Matrix& Y, Int i, Int j ) } } -template -Int AxpyInterface::ReadyForSend +template +Int AxpyInterface ::ReadyForSend ( Int sendSize, - deque>& sendVectors, - deque& requests, - deque& requestStatuses ) + std::deque >& sendVectors, + std::deque & requests, + std::deque & requestStatuses ) { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::ReadyForSend")) + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::ReadyForSend" ) ) const Int numCreated = 
sendVectors.size(); - DEBUG_ONLY( - if( numCreated != Int(requests.size()) || - numCreated != Int(requestStatuses.size()) ) - LogicError("size mismatch"); - ) - for( Int i=0; i -void AxpyInterface::UpdateRequestStatuses() +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +template bool AxpyInterface ::ReturnRequestStatuses() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::UpdateRequestStatuses")) - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::ReturnRequestStatuses" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? + localToGlobalMat_->Grid() : globalToLocalMat_->Grid() ); const Int p = g.Size(); - for( Int i=0; i -void AxpyInterface::Detach() + return true; +} +#else +template void AxpyInterface ::UpdateRequestStatuses() { - DEBUG_ONLY(CallStackEntry cse("AxpyInterface::Detach")) - if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) - LogicError("Must attach before detaching."); - - const Grid& g = ( attachedForLocalToGlobal_ ? - localToGlobalMat_->Grid() : + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::UpdateRequestStatuses" ) ) + const Grid& g = ( attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid() : globalToLocalMat_->Grid() ); + const Int p = g.Size(); + + for( Int i = 0; i < p; ++i ) + { + const Int numDataSendRequests = dataSendRequests_[i].size(); + + for( Int j = 0; j < numDataSendRequests; ++j ) + if( sendingData_[i][j] ) + sendingData_[i][j] = !mpi::Test( dataSendRequests_[i][j] ); + + const Int numRequestSendRequests = requestSendRequests_[i].size(); + + for( Int j = 0; j < numRequestSendRequests; ++j ) + if( sendingRequest_[i][j] ) + sendingRequest_[i][j] = !mpi::Test( requestSendRequests_[i][j] ); + + const Int numReplySendRequests = replySendRequests_[i].size(); + + for( Int j = 0; j < numReplySendRequests; ++j ) + if( sendingReply_[i][j] ) + sendingReply_[i][j] = !mpi::Test( replySendRequests_[i][j] ); + } +} +#endif //EL_USE_IBARRIER_FOR_AXPY + +template void AxpyInterface ::Detach() +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::Detach" ) ) + if( !attachedForLocalToGlobal_ && !attachedForGlobalToLocal_ ) + LogicError( "Must attach before detaching." ); + + const Grid& g = ( attachedForLocalToGlobal_ ? 
+ localToGlobalMat_->Grid() : globalToLocalMat_-> + Grid() ); + // nonblocking consensus +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) + bool DONE = false; + mpi::Request nb_bar_request; + bool nb_bar_active = false; + + while( !DONE ) +#else while( !Finished() ) +#endif { if( attachedForLocalToGlobal_ ) HandleLocalToGlobalData(); else HandleGlobalToLocalRequest(); + +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) + + if( nb_bar_active ) + DONE = mpi::Test( nb_bar_request ); + else + { + if( ReturnRequestStatuses() ) + { + // all ssends are complete, start nonblocking barrier + mpi::IBarrier( g.VCComm(), nb_bar_request ); + nb_bar_active = true; + } + } +#else HandleEoms(); +#endif } mpi::Barrier( g.VCComm() ); - + attachedForLocalToGlobal_ = false; attachedForGlobalToLocal_ = false; - recvVector_.clear(); + +#if MPI_VERSION>=3 && defined(EL_USE_IBARRIER_FOR_AXPY) +#else sentEomTo_.clear(); haveEomFrom_.clear(); - + eomSendRequests_.clear(); +#endif + sendingData_.clear(); sendingRequest_.clear(); sendingReply_.clear(); @@ -745,12 +858,11 @@ void AxpyInterface::Detach() dataVectors_.clear(); requestVectors_.clear(); replyVectors_.clear(); + recvVector_.clear(); dataSendRequests_.clear(); requestSendRequests_.clear(); replySendRequests_.clear(); - - eomSendRequests_.clear(); } #define PROTO(T) template class AxpyInterface; diff --git a/src/core/AxpyInterface2.0.cpp b/src/core/AxpyInterface2.0.cpp new file mode 100644 index 0000000000..5c0f8452b9 --- /dev/null +++ b/src/core/AxpyInterface2.0.cpp @@ -0,0 +1,2028 @@ +/* +This file is part of Elemental and is under the BSD 2-Clause License, +which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +#include "El.hpp" +#include + +namespace El +{ +template +AxpyInterface2::AxpyInterface2() + : GlobalArrayPut_( 0 ), GlobalArrayGet_( 0 ), + matrices_( 0 ), coords_( 0 ), dataVectors_( 0 ), + toBeAttachedForGet_( false ), toBeAttachedForPut_( 
false ), + attached_( false ), detached_( true ) +{ } + +template +AxpyInterface2::AxpyInterface2( DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::AxpyInterface2" ) ) + attached_ = false; + detached_ = true; + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; +} + +template +AxpyInterface2::AxpyInterface2( const DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::AxpyInterface2" ) ) + + attached_ = false; + detached_ = true; + + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; +} + +template +AxpyInterface2::~AxpyInterface2() +{ + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during AxpyInterface2 destructor " + "that required a call to Detach. Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY( DumpCallStack() ) + } + else + Detach(); +} + +template +Int AxpyInterface2::NextIndexData( + Int target, + Int dataSize, + const void* base_address, + Int* mindex ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::NextIndexData" ) ) + assert( base_address != NULL ); + Int matrixIndex = 0; + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + // uninitiated, first time + if( matrices_[m].base_ == NULL ) + { + matrices_[m].base_ = base_address; + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // need to create new object + if( matrixIndex == numMatrices ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; + } + + // go through the request, data, + // status objects + const Int numCreated = matrices_[matrixIndex].data_[target].size(); + + DEBUG_ONLY( if( numCreated != Int( matrices_[matrixIndex].requests_[target].size() ) + || numCreated != Int( matrices_[matrixIndex].statuses_[target].size() ) ) + LogicError( "size mismatch" ); ) + + for( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. 
+ if( matrices_[matrixIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test( matrices_[matrixIndex].requests_[target][i] ); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } + + if( !matrices_[matrixIndex].statuses_[target][i] ) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize( dataSize ); + *mindex = matrixIndex; + return i; + } + } + + matrices_[matrixIndex].data_[target].resize( numCreated + 1 ); + matrices_[matrixIndex].data_[target][numCreated].resize( dataSize ); + matrices_[matrixIndex].requests_[target].push_back( mpi::REQUEST_NULL ); + matrices_[matrixIndex].statuses_[target].push_back( true ); + *mindex = matrixIndex; + return numCreated; +} + +template +Int AxpyInterface2::NextIndexCoord( + Int i, Int j, + Int target, + const void* base_address, + Int* cindex ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::NextIndexCoord" ) ) + assert( base_address != NULL ); + Int coordIndex = 0; + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + const Int numCoords = coords_.size(); + + // search for matrix base + for( Int m = 0; m < numCoords; m++ ) + { + if( coords_[m].base_ == base_address ) + { + coordIndex = m; + break; + } + + if( coords_[m].base_ == NULL ) + { + coords_[m].base_ = base_address; + coordIndex = m; + break; + } + + coordIndex = m+1; + } + + // need to create new object + if( coordIndex == numCoords ) + { + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + coords_[coordIndex].base_ = base_address; + } + + // go through the request, data, + // status objects + const Int numCreated = coords_[coordIndex].coord_[target].size(); + + DEBUG_ONLY( if( numCreated != Int( coords_[coordIndex].requests_[target].size() ) + || numCreated != Int( matrices_[coordIndex].statuses_[target].size() ) ) + LogicError( "size mismatch" ); ) + + for( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. 
+ if( coords_[coordIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test( coords_[coordIndex].requests_[target][i] ); + coords_[coordIndex].statuses_[target][i] = !finished; + } + + if( !coords_[coordIndex].statuses_[target][i] ) + { + coords_[coordIndex].statuses_[target][i] = true; + coords_[coordIndex].coord_[target][i][0] = i; + coords_[coordIndex].coord_[target][i][1] = j; + *cindex = coordIndex; + return i; + } + } + + coords_[coordIndex].coord_[target].resize( numCreated + 1 ); + coords_[coordIndex].coord_[target][numCreated][0] = i; + coords_[coordIndex].coord_[target][numCreated][1] = j; + coords_[coordIndex].requests_[target].push_back( mpi::REQUEST_NULL ); + coords_[coordIndex].statuses_[target].push_back( true ); + *cindex = coordIndex; + return numCreated; +} + +template +void AxpyInterface2::Attach( DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Attach" ) ) + + // attached_ will be only set in Attach + // and only unset in Detach + if( !attached_ && detached_ ) + { + attached_ = true; + detached_ = false; + } + else + LogicError( "Must detach before reattaching." 
); + + const Grid& g = Z.Grid(); + const Int p = g.Size(); + + // the matrix base_ is not known until + // an update operation (put/get/acc) + // so it is kept blank + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + if( dataVectors_.empty() ) + dataVectors_.resize( p ); + + if( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } + } + + mpi::Barrier( g.VCComm() ); +} + +template +void AxpyInterface2::Attach( const DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Attach" ) ) + + // attached_ will be only set in Attach + // and only unset in Detach + if( !attached_ && detached_ ) + { + attached_ = true; + detached_ = false; + } + else + LogicError( "Must detach before reattaching." 
); + + const Grid& g = Z.Grid(); + const Int p = g.Size(); + + // the matrix base_ is not known until + // an update operation (put/get/acc) + // so it is kept blank + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + if( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if( coords_.empty() ) + { + struct coord_params_ cp; + cp.coord_.resize( p ); + cp.requests_.resize( p ); + cp.statuses_.resize( p ); + cp.base_ = NULL; + // push back new matrix_params created + // with default constructor + coords_.push_back( cp ); + } + } + + mpi::Barrier( g.VCComm() ); +} + +// end-to-end blocking put/acc routines +template +void AxpyInterface2::Eput( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Eput" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int XLDim = Z.LDim(); + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int colAlign = ( Y.ColAlign() + i ) % r; + 
const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int YLDim = Y.LDim(); + Int matrix_index, coord_index; + + // data/coord send + for( Int step=0; step 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data + const Int dindex = + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); + + for( Int t=0; t( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } + + // poke + Test( Z ); + // data/coord receive + std::vector recvVector_; + + for( Int step=0; step +void AxpyInterface2::Eput( Matrix& Z, Int i, Int j ) +{ Eput( const_cast&>( Z ), i, j ); } + +// end to end blocking routines +template +void AxpyInterface2::Eacc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Eacc" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int XLDim = Z.LDim(); + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const T* XBuffer = 
Z.LockedBuffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int YLDim = Y.LDim(); + // data/coord receive + std::vector recvVector_; + Int matrix_index, coord_index; + + // data/coord send + for( Int step=0; step 0 ) + { + const Int destination = receivingRow + r*receivingCol; + // data + const Int dindex = + NextIndexData( destination, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[destination][dindex].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + + T* sendBuffer = matrices_[matrix_index].data_[destination][dindex].data(); + + for( Int t=0; t( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } + + // test for requests + Test( Z ); + + for( Int step=0; step +void AxpyInterface2::Eacc( Matrix& Z, Int i, Int j ) +{ Eacc( const_cast&>( Z ), i, j ); } + +template +void AxpyInterface2::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Get" ) ) + + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if( !toBeAttachedForGet_ ) + LogicError( "Cannot perform this operation as matrix is not attached." 
); + + const DistMatrix& X = *GlobalArrayGet_; + const Int height = Z.Height(); + const Int width = Z.Width(); + + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid submatrix for Iget" ); + + T* XBuffer = Z.Buffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + const Grid& g = X.Grid(); + const Int p = g.Size(); + const Int r = g.Height(); + const Int c = g.Width(); + Int coord_index; + std::vector recvVector_; + + // Send out the requests to all processes in the grid + for( Int rank = 0; rank < p; ++rank ) + { + const Int cindex = + NextIndexCoord( i, j, + rank, + Buffer, + &coord_index ); + Int* coord = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data() ); + coord[0] = i; + coord[1] = j; + coord[2] = -1; + mpi::TaggedISend( coord, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex] ); + } + + // Receive all of the replies + Int numReplies = 0; + + while( numReplies < p ) + { + mpi::Status status; + HandleGlobalToLocalData( Z ); + + if( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm(), status ) ) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount ( status ); + recvVector_.resize( count ); + T* recvBuffer = recvVector_.data(); + + // Receive the data + mpi::TaggedRecv + ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm() ); + + // Compute the local heights and offsets + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + + // Unpack the local matrix + for( Int t = 0; t < localWidth; ++t ) + { + //T *YCol = X.Buffer (0, rowShift + t * c); + T* YCol = Z.Buffer( 0, rowShift 
+ t * c ); + const T* XCol = &recvBuffer[t * localHeight]; + + for( Int s = 0; s < localHeight; ++s ) + YCol[colShift + s * r] = XCol[s]; + } + + ++numReplies; + recvVector_.clear(); + } + } +} + +// nonblocking, no local completion +template +void AxpyInterface2::Iput( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Iput" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + Int matrix_index, coord_index; + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int YLDim = Y.LDim(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + + for( Int step=0; step( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + // post receive for coordinates + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } +} + +template +void AxpyInterface2::Iput( Matrix& Z, Int i, Int j ) +{ Iput( const_cast&>( Z ), i, j ); } + +template +void AxpyInterface2::Iget( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( 
CallStackEntry cse( "AxpyInterface2::Iget" ) ) + + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if( !toBeAttachedForGet_ ) + LogicError( "Cannot perform this operation as matrix is not attached." ); + + const DistMatrix& X = *GlobalArrayGet_; + const Int height = Z.Height(); + const Int width = Z.Width(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + Int coord_index; + + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Invalid submatrix for Iget" ); + + const Grid& g = X.Grid(); + const Int p = g.Size(); + + // Send out the requests to all processes in the grid + for( Int rank = 0; rank < p; ++rank ) + { + // send coordinates + const Int cindex = + NextIndexCoord( i, j, + rank, + Buffer, + &coord_index ); + Int* coord_ = reinterpret_cast( coords_[coord_index].coord_[rank][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = -1; + // post receive for coordinates + mpi::TaggedISend( coord_, 3, rank, + REQUEST_GET_TAG, g.VCComm(), + coords_[coord_index].requests_[rank][cindex] ); + } +} + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void AxpyInterface2::Iacc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Iacc" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + Int matrix_index, coord_index; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + 
i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int YLDim = Y.LDim(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + + for( Int step=0; step( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } +} + +template +void AxpyInterface2::Iacc( Matrix& Z, Int i, Int j ) +{ Iacc( const_cast&>( Z ), i, j ); } + +// nonblocking, local completion +template +void AxpyInterface2::Put( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Put" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + Int matrix_index, coord_index; + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int YLDim = Y.LDim(); + + // copy local matrix buffer + 
const Int my_rank = g.VCRank(); + const Int numCreated = dataVectors_[my_rank].size(); + dataVectors_[my_rank].resize( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize( width * height ); + + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); + MemCopy( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); + T* XBuffer = reinterpret_cast ( ZBuffer ); + + for( Int step=0; step( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend( coord_, 3, destination, + COORD_PUT_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } +} + +template +void AxpyInterface2::Put( Matrix& Z, Int i, Int j ) +{ Put( const_cast&>( Z ), i, j ); } + +// input buffer could be modified upon exit +// from this function +template +void AxpyInterface2::Acc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Acc" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + Int matrix_index, coord_index; + + //do boundary checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = 
myProcessRow; + Int receivingCol = myProcessCol; + const Int YLDim = Y.LDim(); + + // copy local matrix buffer + const Int my_rank = g.VCRank(); + const Int numCreated = dataVectors_[my_rank].size(); + dataVectors_[my_rank].resize( numCreated + 1 ); + dataVectors_[my_rank][numCreated].resize( width * height ); + + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + T* ZBuffer = reinterpret_cast ( dataVectors_[my_rank][numCreated].data() ); + MemCopy( ZBuffer, reinterpret_cast ( Buffer ), + height * width ); + T* XBuffer = reinterpret_cast ( ZBuffer ); + + for( Int step=0; step( coords_[coord_index].coord_[destination][cindex].data() ); + coord_[0] = i; + coord_[1] = j; + coord_[2] = numEntries; + mpi::TaggedISend( coord_, 3, destination, + COORD_ACC_TAG, g.VCComm(), + coords_[coord_index].requests_[destination][cindex] ); + } + + receivingRow = ( receivingRow + 1 ) % r; + + if( receivingRow == 0 ) + receivingCol = ( receivingCol + 1 ) % c; + } +} + +template +void AxpyInterface2::Acc( Matrix& Z, Int i, Int j ) +{ Acc( const_cast&>( Z ), i, j ); } + +// waitany implementation +// cannot use mpi::Waitany +// as of now because request +// objects are vector of deques +template +void AxpyInterface2::WaitAny( const Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::WaitAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // search for matrix base in coords + for( Int c = 0; c < numCoords; c++ ) + { + if( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; + } + + // matrix not found + if( matrixIndex == numMatrices && + coordIndex == numCoords ) + return; + + // data + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + if( !matrices_[matrixIndex].statuses_[rank][i] ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + return; + } + } + } + + // coordinates + for( int rank = 0; rank < p; ++rank ) + { + if( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + if( !coords_[coordIndex].statuses_[rank][i] ) + { + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + return; + } + } + } +} + +template +void AxpyInterface2::WaitAny( Matrix& Z ) +{ WaitAny( const_cast&>( Z ) ); } + +template +void AxpyInterface2::Wait( const Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Wait" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // search for matrix base in coords + for( Int c = 0; c < numCoords; c++ ) + { + if( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; + } + + // matrix not found + if( matrixIndex == numMatrices && + coordIndex == numCoords ) + return; + + // data + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } + + // coordinates + for( int rank = 0; rank < p; ++rank ) + { + if( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + } + } +} + +template +void AxpyInterface2::Wait( Matrix& Z ) +{ Wait( const_cast&>( Z ) ); } + +template +void AxpyInterface2::Waitall() +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Waitall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + // data + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } + } + + // coordinates + for( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + mpi::Wait( coords_[coordIndex].requests_[rank][i] ); + coords_[coordIndex].statuses_[rank][i] = true; + } + } + } +} + +template +bool AxpyInterface2::Test( const Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Test" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // search for matrix base in coords + for( Int c = 0; c < numCoords; c++ ) + { + if( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; + } + + // matrix not found + if( matrixIndex == numMatrices && + coordIndex == numCoords ) + return true; + + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } + } + + for( int rank = 0; rank < p; ++rank ) + { + if( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); + + if( coords_[coordIndex].statuses_[rank][i] ) + return false; + } + } + + return true; +} + +template +bool AxpyInterface2::Test( Matrix& Z ) +{ return Test( const_cast&>( Z ) ); } + +template +bool AxpyInterface2::TestAny( const Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::TestAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex, coordIndex; + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // search for matrix base in coords + for( Int c = 0; c < numCoords; c++ ) + { + if( coords_[c].base_ == base_address ) + { + coordIndex = c; + break; + } + + coordIndex = c+1; + } + + // matrix not found + if( matrixIndex == numMatrices && + coordIndex == numCoords ) + return true; + + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + continue; + else + return true; + } + } + + for( int rank = 0; rank < p; ++rank ) + { + if( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); + + if( coords_[coordIndex].statuses_[rank][i] ) + continue; + else + return true; + } + } + + return false; +} + +template +bool AxpyInterface2::TestAny( Matrix& Z ) +{ return TestAny( const_cast&>( Z ) ); } + +template +bool AxpyInterface2::Testall() +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Testall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." 
); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + const Int numCoords = coords_.size(); + + // data + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } + } + } + + // coordinates + for( int coordIndex = 0; coordIndex < numCoords; ++coordIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + if( coords_[coordIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numCoordStatuses = coords_[coordIndex].requests_[rank].size(); + + for( int i = 0; i < numCoordStatuses; i++ ) + { + coords_[coordIndex].statuses_[rank][i] = + !mpi::Test( coords_[coordIndex].requests_[rank][i] ); + + if( coords_[coordIndex].statuses_[rank][i] ) + return false; + } + } + } + + return true; +} + +// This is non-collective flush +// This will ensure local+remote completion +// if Z is const then only Put/Acc is possible +template +void AxpyInterface2::Flush( const Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Flush" ) ) + + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer before flushing." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + bool DONE = false; + mpi::Status status; + + while( !DONE ) + { + if( mpi::IProbe( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status ) ) + { + switch( status.MPI_TAG ) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc( Z, status.MPI_SOURCE ); + break; + } + } + } + + // wait for requests to + // complete one by one + WaitAny( Z ); + DONE = Test( Z ); + } +} + +template +void AxpyInterface2::Flush( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Flush" ) ) + + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer before flushing." ); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + bool DONE = false; + mpi::Status status; + + while( !DONE ) + { + if( mpi::IProbe( mpi::ANY_SOURCE, mpi::ANY_TAG, g.VCComm(), status ) ) + { + switch( status.MPI_TAG ) + { + case DATA_PUT_TAG: + { + HandleLocalToGlobalData( Z, status.MPI_SOURCE ); + break; + } + case DATA_ACC_TAG: + { + HandleLocalToGlobalAcc( Z, status.MPI_SOURCE ); + break; + } + case REQUEST_GET_TAG: + { + HandleGlobalToLocalData( Z ); + break; + } + } + } + + // wait for requests to + // complete one by one + WaitAny( Z ); + DONE = Test( Z ); + } +} + +template +void AxpyInterface2::HandleLocalToGlobalData( const Matrix& Z, Int source ) +{ + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + int height = Z.Height(); + int width = Z.Width(); + + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv( coord, 3, source, + COORD_PUT_TAG, g.VCComm() ); + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + // data vector + std::vector getVector_; + getVector_.resize( count ); + + DEBUG_ONLY( if( count < Int( sizeof( T ) ) ) 
+ LogicError( "Count was too small" ); ) + DEBUG_ONLY( if( Int( getVector_.size() ) != count ) + LogicError( "Not enough space allocated" ); ) + + // post receive for data + T* getBuffer = getVector_.data(); + mpi::TaggedRecv( getBuffer, count, source, + DATA_PUT_TAG, g.VCComm() ); + + // Update Y + const T* XBuffer = const_cast ( getBuffer ); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + + for( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Y.Buffer( iLocalOffset, jLocalOffset + t ); + const T* XCol = &XBuffer[t * localHeight]; + MemCopy( YCol, XCol, localHeight ); + } + + // Free the memory + getVector_.clear(); +} + +template +void AxpyInterface2::HandleLocalToGlobalData( Matrix& Z, Int source ) +{ HandleLocalToGlobalData( const_cast&>( Z ), source ); } + +// replica of above function except this accumulates +template +void AxpyInterface2::HandleLocalToGlobalAcc( const Matrix& Z, Int source ) +{ + DistMatrix& Y = *GlobalArrayPut_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const int height = Z.Height(); + const int width = Z.Width(); + + // post receive for coordinates + Int coord[3]; + mpi::TaggedRecv( coord, 3, source, + COORD_ACC_TAG, g.VCComm() ); + Int i = coord[0]; + Int j = coord[1]; + Int count = coord[2]; + // data buffer + std::vector getVector_; + getVector_.resize( count ); + + DEBUG_ONLY( if( count < Int( sizeof( T ) ) ) + LogicError( "Count was too small" ); ) + DEBUG_ONLY( if( Int( getVector_.size() ) != count ) + LogicError( "Not enough space allocated" ); ) + + // post 
receive for data + T* getBuffer = getVector_.data(); + mpi::TaggedRecv( getBuffer, count, source, + DATA_ACC_TAG, g.VCComm() ); + // Update Y + const T* XBuffer = const_cast ( getBuffer ); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + + for( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Y.Buffer( iLocalOffset, jLocalOffset + t ); + const T* XCol = &XBuffer[t * localHeight]; + + for( Int s = 0; s < localHeight; ++s ) + YCol[s] += XCol[s]; + } + + // Free the memory + getVector_.clear(); +} + +template +void AxpyInterface2::HandleLocalToGlobalAcc( Matrix& Z, Int source ) +{ HandleLocalToGlobalAcc( const_cast&>( Z ), source ); } + +// handle request for data, post a matching isend +template +void AxpyInterface2::HandleGlobalToLocalData( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface::HandleGlobalToLocalData" ) ) + + if( !toBeAttachedForGet_ ) + LogicError( "Local matrix cannot be updated" ); + + const DistMatrix& Y = *GlobalArrayGet_; + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + Int i, j; + Int matrix_index; + std::vector recvVector_; + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + for( Int step = 0; step < p; step++ ) + { + mpi::Status status; + + if( mpi::IProbe( mpi::ANY_SOURCE, REQUEST_GET_TAG, g.VCComm(), status ) ) + { + const Int source = status.MPI_SOURCE; + // post receive for coordinates 
+ Int coord[3]; + mpi::TaggedRecv( coord, 3, source, + REQUEST_GET_TAG, g.VCComm() ); + i = coord[0]; + j = coord[1]; + // we need the localwidth/height here, + // used also to calculate numEntries + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + const Int numEntries = localHeight * localWidth; + + DEBUG_ONLY( if( numEntries < Int( sizeof( T ) ) ) + LogicError( "Count was too small" ); ) + const Int index = + NextIndexData( source, + numEntries, + Buffer, + &matrix_index ); + + DEBUG_ONLY( if + ( Int( matrices_[matrix_index].data_[source][index].size() ) != + numEntries ) LogicError( "Error in NextIndexData" ); ) + T* replyBuffer = matrices_[matrix_index].data_[source][index].data(); + + for( Int t = 0; t < localWidth; ++t ) + { + T* sendCol = &replyBuffer[t * localHeight]; + const T* XCol = Y.LockedBuffer( iLocalOffset, jLocalOffset + t ); + MemCopy( sendCol, XCol, localHeight ); + } + + // Fire off non-blocking send + mpi::TaggedISend( replyBuffer, numEntries, source, + DATA_GET_TAG, g.VCComm(), + matrices_[matrix_index].requests_[source][index] ); + } + + // receive data + if( mpi::IProbe + ( mpi::ANY_SOURCE, DATA_GET_TAG, g.VCComm(), status ) ) + { + const Int source = status.MPI_SOURCE; + // Ensure that we have a recv buffer + const Int count = mpi::GetCount ( status ); + recvVector_.resize( count ); + T* recvBuffer = recvVector_.data(); + // Receive the data + mpi::TaggedRecv + ( recvBuffer, count, source, DATA_GET_TAG, g.VCComm() ); + // Compute the local heights and offsets + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + 
const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int colShift = Shift( myRow, colAlign, r ); + const Int rowShift = Shift( myCol, rowAlign, c ); + const Int localHeight = Length( height, colShift, r ); + const Int localWidth = Length( width, rowShift, c ); + + // Unpack the local matrix + for( Int t = 0; t < localWidth; ++t ) + { + T* YCol = Z.Buffer( 0, rowShift + t * c ); + const T* XCol = &recvBuffer[t * localHeight]; + + for( Int s = 0; s < localHeight; ++s ) + YCol[colShift + s * r] = XCol[s]; + } + } + } + + recvVector_.clear(); +} + +// detach collectively +template +void AxpyInterface2::Detach() +{ + DEBUG_ONLY( CallStackEntry cse( "AxpyInterface2::Detach" ) ) + + // destructor will call detach again... + if( detached_ ) + return; + + if( !attached_ ) + LogicError( "Must attach before detaching." ); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + + mpi::Barrier( g.VCComm() ); + + attached_ = false; + detached_ = true; + + toBeAttachedForPut_ = false; + toBeAttachedForGet_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + if( !dataVectors_.empty() ) + dataVectors_.clear(); + + matrices_.clear(); + coords_.clear(); +} + +#define PROTO(T) template class AxpyInterface2; +#include "El/macros/Instantiate.h" + +} // namespace El diff --git a/src/core/RmaInterface.cpp b/src/core/RmaInterface.cpp new file mode 100644 index 0000000000..a3e6420747 --- /dev/null +++ b/src/core/RmaInterface.cpp @@ -0,0 +1,1271 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Jeff Hammond (Intel) + Copyright (c) 2014, Sayan Ghosh (Washington State University) + All rights reserved. + +Authors: +Jeff Hammond adapted the RMA interface from the AXPY one. 
+ +This file is part of Elemental and is under the BSD 2-Clause License, +which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +#include "El.hpp" +#include + +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +namespace El +{ +template +RmaInterface::RmaInterface() + : GlobalArrayPut_( 0 ), GlobalArrayGet_( 0 ), + matrices_( 0 ), window( MPI_WIN_NULL ), + putVector_( 0 ), getVector_( 0 ), + toBeAttachedForPut_( false ), toBeAttachedForGet_( false ), + attached_( false ), detached_( true ) +{ } + +template +RmaInterface::RmaInterface( DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) ) + + attached_ = false; + detached_ = true; + + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + window = MPI_WIN_NULL; +} + +// until attach, I am not setting anything +// which might not be a good thing to do, +// but would modify this eventually +template +RmaInterface::RmaInterface( const DistMatrix& X ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::RmaInterface" ) ) + + attached_ = false; + detached_ = true; + + toBeAttachedForGet_ = false; + toBeAttachedForPut_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + window = MPI_WIN_NULL; +} + +template +RmaInterface::~RmaInterface() +{ + if( std::uncaught_exception() ) + { + std::ostringstream os; + os << "Uncaught exception detected during RmaInterface destructor " + "that required a call to Detach. 
Instead of allowing for the " + "possibility of Detach throwing another exception and " + "resulting in a 'terminate', we instead immediately dump the " + "call stack (if not in RELEASE mode) since the program will " + "likely hang:" << std::endl; + std::cerr << os.str(); + DEBUG_ONLY( DumpCallStack() ) + } + else + Detach(); +} + +template +void RmaInterface::Attach( DistMatrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Attach" ) ) + + // attached_ will be only set in Attach + // and only unset in Detach + if( !attached_ && detached_ ) + { + attached_ = true; + detached_ = false; + } + else + LogicError( "Must detach before reattaching." ); + + // if DistMatrix is non-const, all one-sided + // transfers -- put, get and acc are possible + if( !toBeAttachedForPut_ && !toBeAttachedForGet_ ) + { + GlobalArrayPut_ = &Z; + toBeAttachedForPut_ = true; + GlobalArrayGet_ = &Z; + toBeAttachedForGet_ = true; + + const Grid& g = Z.Grid(); + const Int p = g.Size(); + + if( matrices_.empty() ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + } + + if( putVector_.empty() ) + { + getVector_.resize( p ); + putVector_.resize( p ); + } + + // TODO rma related checks + // creation of window + const Int numEntries = Z.LocalHeight() * Z.LocalWidth(); + const Int bufferSize = numEntries * sizeof( T ); + void* baseptr = reinterpret_cast( Z.Buffer() ); + assert( baseptr != NULL ); + mpi::WindowCreate( baseptr, bufferSize, g.VCComm(), window ); + mpi::WindowLock( window ); + } +} + +// for gets +template +void RmaInterface::Attach( const DistMatrix& X ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Attach" ) ) + + if( !attached_ && detached_ ) + { + attached_ = true; + detached_ = false; + } + else + LogicError( "Must detach before reattaching." 
); + + if( !toBeAttachedForGet_ ) + { + GlobalArrayGet_ = &X; + toBeAttachedForGet_ = true; + GlobalArrayPut_ = 0; + toBeAttachedForPut_ = false; + + const Grid& g = X.Grid(); + const Int p = g.Size(); + + if( getVector_.size() != p ) + getVector_.resize( p ); + + //TODO rma related checks + const Int numEntries = X.LocalHeight() * X.LocalWidth(); + const Int bufferSize = numEntries * sizeof( T ); + void* baseptr = static_cast( const_cast( X.LockedBuffer() ) ); + assert( baseptr != NULL ); + mpi::WindowCreate( baseptr, bufferSize, g.VCComm(), window ); + mpi::WindowLock( window ); + } +} + +// for standard passive rma +template +Int RmaInterface::NextIndex +( Int dataSize, + std::deque >& dataVectors ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::NextIndex" ) ) + const Int numCreated = dataVectors.size(); + dataVectors.resize( numCreated + 1 ); + dataVectors[numCreated].resize( dataSize ); + return numCreated; +} + +// for request-based passive rma +template +Int RmaInterface::NextIndex( + Int target, + Int dataSize, + const void* base_address, + Int* mindex ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::NextIndex" ) ) + assert( base_address != NULL ); + Int matrixIndex = 0; + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + // uninitiated, first time + if( matrices_[m].base_ == NULL ) + { + matrices_[m].base_ = base_address; + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // need to create new object + if( matrixIndex == numMatrices ) + { + struct matrix_params_ mp; + mp.data_.resize( p ); + mp.requests_.resize( p ); + mp.statuses_.resize( p ); + mp.base_ = NULL; + // push back new matrix_params created + // with default constructor + matrices_.push_back( mp ); + matrices_[matrixIndex].base_ = base_address; + } + + // go through the request, data, + // status objects + const Int numCreated = matrices_[matrixIndex].data_[target].size(); + + DEBUG_ONLY( if( numCreated != Int( matrices_[matrixIndex].requests_[target].size() ) || + numCreated != Int( matrices_[matrixIndex].statuses_[target].size() ) ) + LogicError( "size mismatch" ); ) + + for( Int i = 0; i < numCreated; ++i ) + { + // If this request is still running, + // test to see if it finished. 
+ if( matrices_[matrixIndex].statuses_[target][i] ) + { + const bool finished = mpi::Test( matrices_[matrixIndex].requests_[target][i] ); + matrices_[matrixIndex].statuses_[target][i] = !finished; + } + + if( !matrices_[matrixIndex].statuses_[target][i] ) + { + matrices_[matrixIndex].statuses_[target][i] = true; + matrices_[matrixIndex].data_[target][i].resize( dataSize ); + *mindex = matrixIndex; + return i; + } + } + + matrices_[matrixIndex].data_[target].resize( numCreated + 1 ); + matrices_[matrixIndex].data_[target][numCreated].resize( dataSize ); + matrices_[matrixIndex].requests_[target].push_back( mpi::REQUEST_NULL ); + matrices_[matrixIndex].statuses_[target].push_back( true ); + *mindex = matrixIndex; + return numCreated; +} + +// request based RMA operations +template +void RmaInterface::Rput( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Rput" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + const Int YLDim = Y.LDim(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast( const_cast( Z.LockedBuffer() ) ); + 
Int matrix_index; + + for( Int step=0; step( matrices_[matrix_index].data_[destination][index].data() ); + + for( Int t=0; t +void RmaInterface::Rput( Matrix& Z, Int i, Int j ) +{ Rput( const_cast&>( Z ), i, j ); } + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void RmaInterface::Racc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Racc" ) ) + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated." ); + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative." ); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix." ); + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + const T* XBuffer = Z.LockedBuffer(); + const void* Buffer = static_cast ( const_cast ( Z.LockedBuffer() ) ); + Int matrix_index; + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step( matrices_[matrix_index].data_[destination][index].data() ); + + for( Int t=0; t +void RmaInterface::Racc( Matrix& Z, Int i, Int j ) +{ Racc( const_cast&>( Z ), i, j ); } + +// Locally Blocking +template +void RmaInterface::Put( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Put" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( 
!toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + const Int YLDim = Y.LDim(); + const T* XBuffer = Z.LockedBuffer(); + + for( Int step=0; step +void RmaInterface::Put( Matrix& Z, Int i, Int j ) +{ Put( const_cast&>( Z ), i, j ); } + +// accumulate = Update Y(i:i+height-1,j:j+width-1) += X, +// where X is height x width +template +void RmaInterface::Acc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Acc" ) ) + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated." ); + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative." ); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix." 
); + + //do rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const T* XBuffer = Z.LockedBuffer(); + + for( Int step=0; step::NextIndex( numEntries, + putVector_[destination] ); + T* sendBuffer = putVector_[destination][index].data(); + + for( Int t=0; t +void RmaInterface::Acc( Matrix& Z, Int i, Int j ) +{ Acc( const_cast&>( Z ), i, j ); } + +// TODO Iget and Rget +template +void RmaInterface::Get( Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Get" ) ) + + // a call to Attach with a non-const DistMatrix must set + // toBeAttachedForGet_ also, if not then it is assumed that + // the DistMatrix isn't attached + if( !toBeAttachedForGet_ ) + LogicError( "Cannot perform this operation as matrix is not attached." 
); + + const DistMatrix& X = *GlobalArrayGet_; + const Grid& g = X.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myRow = g.Row(); + const Int myCol = g.Col(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + // local width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + + if( i + height > X.Height() || j + width > X.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Int colAlign = ( X.ColAlign() + i ) % r; + const Int rowAlign = ( X.RowAlign() + j ) % c; + const Int iLocalOffset = Length( i, X.ColShift(), r ); + const Int jLocalOffset = Length( j, X.RowShift(), c ); + const Int XLDim = X.LDim(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step::NextIndex( numEntries, + getVector_[destination] ); + T* getBuffer = getVector_[destination][index].data(); + + // get + for( Int t=0; t +void RmaInterface::Iput( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Iput" ) ) + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative" ); + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated" ); + + DistMatrix& Y = *GlobalArrayPut_; + + //do rma related checks + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix" ); + + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int 
jLocalOffset = Length( j, Y.RowShift(), c ); + const Int YLDim = Y.LDim(); + const T* XBuffer = Z.LockedBuffer(); + + for( Int step=0; step +void RmaInterface::Iacc( const Matrix& Z, Int i, Int j ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Iacc" ) ) + + if( !toBeAttachedForPut_ ) + LogicError( "Global matrix cannot be updated." ); + + if( i < 0 || j < 0 ) + LogicError( "Submatrix offsets must be non-negative." ); + + DistMatrix& Y = *GlobalArrayPut_; + + if( i+Z.Height() > Y.Height() || j+Z.Width() > Y.Width() ) + LogicError( "Submatrix out of bounds of global matrix." ); + + //TODO rma related checks + const Grid& g = Y.Grid(); + const Int r = g.Height(); + const Int c = g.Width(); + const Int p = g.Size(); + const Int myProcessRow = g.Row(); + const Int myProcessCol = g.Col(); + const Int colAlign = ( Y.ColAlign() + i ) % r; + const Int rowAlign = ( Y.RowAlign() + j ) % c; + const Int XLDim = Z.LDim(); + const Int YLDim = Y.LDim(); + + // local matrix width and height + const Int height = Z.Height(); + const Int width = Z.Width(); + const T* XBuffer = Z.LockedBuffer(); + const Int iLocalOffset = Length( i, Y.ColShift(), r ); + const Int jLocalOffset = Length( j, Y.RowShift(), c ); + Int receivingRow = myProcessRow; + Int receivingCol = myProcessCol; + + for( Int step=0; step +void RmaInterface::Iput( Matrix& Z, Int i, Int j ) +{ Iput( const_cast&>( Z ), i, j ); } + +template +void RmaInterface::Iacc( Matrix& Z, Int i, Int j ) +{ Iacc( const_cast&>( Z ), i, j ); } + +// Local completion of all ops upon +// return +template +void RmaInterface::LocalFlush() +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::LocalFlush" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer before flushing." 
); + + mpi::FlushLocal( window ); +} + +// Local completion (specific to Z) upon +// return +template +void RmaInterface::LocalFlush( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::LocalFlush" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer before flushing." ); + + // if there are no request based RMA pending + // for Z, then this functions acts like Flush + // local all + if( !anyPendingXfers( Z ) ) + LocalFlush(); + else + Wait( Z ); +} + +// there is no use as of now in +// passing Z, as mpi3 flush enforces +// completion of *all* operations on +// process window +template +void RmaInterface::Flush( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Flush" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer before flushing." ); + + mpi::Flush( window ); +} + +template +bool RmaInterface::anyPendingXfers( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::anyPendingXfers" ) ) + // by default, number of matrices + // == number of processes + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // matrix not found + if( matrixIndex == numMatrices ) + return false; + + return true; +} + +// waitany implementation +// cannot use mpi::Waitany +// as of now because request +// objects are vector of deques +template +void RmaInterface::WaitAny( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::WaitAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // matrix not found + if( matrixIndex == numMatrices ) + return; + + // data + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + if( !matrices_[matrixIndex].statuses_[rank][i] ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + return; + } + } + } +} + +template +void RmaInterface::Wait( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Wait" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // matrix not found + if( matrixIndex == numMatrices ) + return; + + // data + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } +} + +template +void RmaInterface::Waitall() +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Waitall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex; + const Int numMatrices = matrices_.size(); + + // data + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + mpi::Wait( matrices_[matrixIndex].requests_[rank][i] ); + matrices_[matrixIndex].statuses_[rank][i] = true; + } + } + } +} + +template +bool RmaInterface::Test( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Test" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // matrix not found + if( matrixIndex == numMatrices ) + return true; + + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } + } + + return true; +} + +// TODO Use mpi::Testany instead of mpi::Test +// at present request object is vector +// of deques, so cannot convert it to +// an array required by Testany +template +bool RmaInterface::TestAny( Matrix& Z ) +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::TestAny" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + Int matrixIndex; + const Int numMatrices = matrices_.size(); + const void* base_address = static_cast( const_cast( Z.LockedBuffer() ) ); + + // search for matrix base + for( Int m = 0; m < numMatrices; m++ ) + { + if( matrices_[m].base_ == base_address ) + { + matrixIndex = m; + break; + } + + matrixIndex = m+1; + } + + // matrix not found + if( matrixIndex == numMatrices ) + return true; + + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + continue; + else + return true; + } + } + + return false; +} + +template +bool RmaInterface::Testall() +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Testall" ) ) + + if( !toBeAttachedForPut_ || !toBeAttachedForGet_ ) + LogicError( "Must initiate transfer at first." ); + + const Grid& g = ( toBeAttachedForPut_ ? 
+ GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + const Int p = g.Size(); + const Int numMatrices = matrices_.size(); + + // data + for( int matrixIndex = 0; matrixIndex < numMatrices; ++matrixIndex ) + { + for( int rank = 0; rank < p; ++rank ) + { + if( matrices_[matrixIndex].statuses_[rank].size() == 0 ) + continue; + + const Int numDataStatuses = matrices_[matrixIndex].requests_[rank].size(); + + for( int i = 0; i < numDataStatuses; i++ ) + { + matrices_[matrixIndex].statuses_[rank][i] = + !mpi::Test( matrices_[matrixIndex].requests_[rank][i] ); + + if( matrices_[matrixIndex].statuses_[rank][i] ) + return false; + } + } + } + + return true; +} + +template +void RmaInterface::Detach() +{ + DEBUG_ONLY( CallStackEntry cse( "RmaInterface::Detach" ) ) + + // destructor will call detach again... + if( detached_ ) + return; + + if( !attached_ ) + LogicError( "Must attach before detaching." ); + + const Grid& g = ( toBeAttachedForPut_ ? + GlobalArrayPut_->Grid() : + GlobalArrayGet_->Grid() ); + + mpi::Barrier( g.VCComm() ); + + attached_ = false; + detached_ = true; + + toBeAttachedForPut_ = false; + toBeAttachedForGet_ = false; + + GlobalArrayPut_ = 0; + GlobalArrayGet_ = 0; + + putVector_.clear(); + getVector_.clear(); + + matrices_.clear(); + + mpi::WindowUnlock( window ); + mpi::WindowFree( window ); +} + +#define PROTO(T) template class RmaInterface; +#include "El/macros/Instantiate.h" + +} // namespace El +#endif // EL_ENABLE_RMA_AXPY + diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 9d0f075959..8152a72dab 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -4,18 +4,19 @@ 2013, Jed Brown All rights reserved. 
- This file is part of Elemental and is under the BSD 2-Clause License, - which can be found in the LICENSE file in the root directory, or at - http://opensource.org/licenses/BSD-2-Clause + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause */ + #include "El.hpp" +#include typedef unsigned char* UCP; -namespace { - -inline void -SafeMpi( int mpiError ) +namespace +{ +inline void SafeMpi (int mpiError) { DEBUG_ONLY( if( mpiError != MPI_SUCCESS ) @@ -27,11 +28,12 @@ SafeMpi( int mpiError ) } ) } +} // anonymous namespace -} // anonymous namespace - -namespace El { -namespace mpi { +namespace El +{ +namespace mpi +{ bool CommSameSizeAsInteger() { return sizeof(MPI_Comm) == sizeof(int); } @@ -42,1718 +44,3533 @@ bool GroupSameSizeAsInteger() // MPI environmental routines // ========================== -void Initialize( int& argc, char**& argv ) -{ MPI_Init( &argc, &argv ); } +void Initialize (int &argc, char **&argv) +{ + MPI_Init (&argc, &argv); +} -int InitializeThread( int& argc, char**& argv, int required ) -{ - int provided; +int InitializeThread (int &argc, char **&argv, + int required) +{ + int provided; #ifdef EL_HAVE_MPI_INIT_THREAD - MPI_Init_thread( &argc, &argv, required, &provided ); + MPI_Init_thread (&argc, &argv, required, &provided); #else - MPI_Init( &argc, &argv ); - provided = 0; // equivalent to MPI_THREAD_SINGLE + MPI_Init (&argc, &argv); + provided = 0; // equivalent to MPI_THREAD_SINGLE #endif return provided; } -void Finalize() -{ MPI_Finalize(); } +void Finalize () +{ + MPI_Finalize (); +} -bool Initialized() -{ +bool Initialized () +{ int initialized; - MPI_Initialized( &initialized ); + + MPI_Initialized (&initialized); return initialized; } -bool Finalized() +bool Finalized () { int finalized; - MPI_Finalized( &finalized ); + + MPI_Finalized (&finalized); return finalized; } -int QueryThread() +int QueryThread 
() { int provided; + #ifdef EL_HAVE_MPI_QUERY_THREAD - MPI_Query_thread( &provided ); + MPI_Query_thread (&provided); #else - provided = 0; // equivalent to MPI_THREAD_SINGLE + provided = 0; // equivalent to MPI_THREAD_SINGLE #endif return provided; } -void Abort( Comm comm, int errCode ) -{ MPI_Abort( comm.comm, errCode ); } +void Abort (Comm comm, int errCode) +{ + MPI_Abort (comm.comm, errCode); +} -double Time() -{ return MPI_Wtime(); } +double Time () +{ + return MPI_Wtime (); +} -void Create( UserFunction* func, bool commutes, Op& op ) +void Create (UserFunction * func, bool commutes, Op & op) { - DEBUG_ONLY(CallStackEntry cse("mpi::Create")) - SafeMpi( MPI_Op_create( func, commutes, &op.op ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Create")) + SafeMpi (MPI_Op_create (func, commutes, &op.op)); } -void Free( Op& op ) +void Free (Op & op) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Op_free( &op.op ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Op_free (&op.op)); } -// Communicator manipulation +// Communicator manipulation // ========================= -int WorldRank() +int WorldRank () { - DEBUG_ONLY(CallStackEntry cse("mpi::WorldRank")) - return Rank( mpi::COMM_WORLD ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WorldRank")) + return Rank (mpi::COMM_WORLD); } -int Rank( Comm comm ) +int Rank (Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Rank")) - if( comm != COMM_NULL ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Rank")) + if (comm != COMM_NULL) { int rank; - SafeMpi( MPI_Comm_rank( comm.comm, &rank ) ); + + SafeMpi (MPI_Comm_rank (comm.comm, &rank)); return rank; } - else return mpi::UNDEFINED; + else + return mpi::UNDEFINED; } -int Size( Comm comm ) +int Size (Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Size")) - if( comm != COMM_NULL ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Size")) + if (comm != COMM_NULL) { int size; - SafeMpi( MPI_Comm_size( comm.comm, &size ) ); + + SafeMpi (MPI_Comm_size (comm.comm, 
&size)); return size; - } - else return mpi::UNDEFINED; + } + else + return mpi::UNDEFINED; } -void Create( Comm parentComm, Group subsetGroup, Comm& subsetComm ) +void Create (Comm parentComm, Group subsetGroup, + Comm & subsetComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Create")) - SafeMpi( - MPI_Comm_create( parentComm.comm, subsetGroup.group, &subsetComm.comm ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Create")) + SafeMpi (MPI_Comm_create + (parentComm.comm, subsetGroup.group, + &subsetComm.comm)); } -void Dup( Comm original, Comm& duplicate ) +void Dup (Comm original, Comm & duplicate) { - DEBUG_ONLY(CallStackEntry cse("mpi::Dup")) - SafeMpi( MPI_Comm_dup( original.comm, &duplicate.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Dup")) + SafeMpi (MPI_Comm_dup + (original.comm, &duplicate.comm)); } -void Split( Comm comm, int color, int key, Comm& newComm ) +void Split (Comm comm, int color, int key, Comm & newComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Split")) - SafeMpi( MPI_Comm_split( comm.comm, color, key, &newComm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Split")) + SafeMpi (MPI_Comm_split + (comm.comm, color, key, &newComm.comm)); } -void Free( Comm& comm ) +void Free (Comm & comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Comm_free( &comm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Comm_free (&comm.comm)); } -bool Congruent( Comm comm1, Comm comm2 ) +bool Congruent (Comm comm1, Comm comm2) { - DEBUG_ONLY(CallStackEntry cse("mpi::Congruent")) - int result; - SafeMpi( MPI_Comm_compare( comm1.comm, comm2.comm, &result ) ); - return ( result == MPI_IDENT || result == MPI_CONGRUENT ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Congruent")) int + result; + SafeMpi (MPI_Comm_compare + (comm1.comm, comm2.comm, &result)); + return (result == MPI_IDENT + || result == MPI_CONGRUENT); } -void ErrorHandlerSet( Comm comm, ErrorHandler errorHandler ) +void ErrorHandlerSet (Comm comm, + ErrorHandler 
errorHandler) { - DEBUG_ONLY(CallStackEntry cse("mpi::ErrorHandlerSet")) + DEBUG_ONLY (CallStackEntry + cse ("mpi::ErrorHandlerSet")) #ifdef EL_HAVE_MPI_COMM_SET_ERRHANDLER - SafeMpi( MPI_Comm_set_errhandler( comm.comm, errorHandler ) ); + SafeMpi (MPI_Comm_set_errhandler + (comm.comm, errorHandler)); #else - SafeMpi( MPI_Errhandler_set( comm.comm, errorHandler ) ); + SafeMpi (MPI_Errhandler_set + (comm.comm, errorHandler)); #endif } -// Cartesian communicator routines +// Cartesian communicator routines // =============================== void CartCreate -( Comm comm, int numDims, const int* dimensions, const int* periods, - bool reorder, Comm& cartComm ) +(Comm comm, int numDims, const int *dimensions, + const int *periods, bool reorder, Comm & cartComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::CartCreate")) + DEBUG_ONLY (CallStackEntry cse ("mpi::CartCreate")) SafeMpi - ( MPI_Cart_create - ( comm.comm, numDims, const_cast(dimensions), - const_cast(periods), reorder, &cartComm.comm ) ); + (MPI_Cart_create + (comm.comm, numDims, + const_cast < int *>(dimensions), + const_cast < int *>(periods), reorder, + &cartComm.comm)); } -void CartSub( Comm comm, const int* remainingDims, Comm& subComm ) +void CartSub (Comm comm, const int *remainingDims, + Comm & subComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::CartSub")) - SafeMpi( - MPI_Cart_sub - ( comm.comm, const_cast(remainingDims), &subComm.comm ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::CartSub")) + SafeMpi (MPI_Cart_sub + (comm.comm, + const_cast < int *>(remainingDims), + &subComm.comm)); } -// Group manipulation +// Group manipulation // ================== -int Rank( Group group ) +int Rank (Group group) { - DEBUG_ONLY(CallStackEntry cse("mpi::Rank")) - int rank; - SafeMpi( MPI_Group_rank( group.group, &rank ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Rank")) int + rank; + SafeMpi (MPI_Group_rank (group.group, &rank)); return rank; } -int Size( Group group ) +int Size (Group group) { - DEBUG_ONLY(CallStackEntry 
cse("mpi::Size")) - int size; - SafeMpi( MPI_Group_size( group.group, &size ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Size")) int + size; + SafeMpi (MPI_Group_size (group.group, &size)); return size; } -void CommGroup( Comm comm, Group& group ) +void CommGroup (Comm comm, Group & group) { - DEBUG_ONLY(CallStackEntry cse("mpi::CommGroup")) - SafeMpi( MPI_Comm_group( comm.comm, &group.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::CommGroup")) + SafeMpi (MPI_Comm_group + (comm.comm, &group.group)); } -void Dup( Group group, Group& newGroup ) +void Dup (Group group, Group & newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Dup")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Dup")) // For some reason, MPI_Group_dup does not exist - Excl( group, 0, 0, newGroup ); + Excl (group, 0, 0, newGroup); } -void Union( Group groupA, Group groupB, Group& newGroup ) +void Union (Group groupA, Group groupB, Group & newGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Union")) - SafeMpi( MPI_Group_union( groupA.group, groupB.group, &newGroup.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Union")) + SafeMpi (MPI_Group_union + (groupA.group, groupB.group, + &newGroup.group)); } -void Incl( Group group, int n, const int* ranks, Group& subGroup ) +void Incl (Group group, int n, const int *ranks, + Group & subGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Incl")) - SafeMpi( - MPI_Group_incl - ( group.group, n, const_cast(ranks), &subGroup.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Incl")) + SafeMpi (MPI_Group_incl + (group.group, n, + const_cast < int *>(ranks), + &subGroup.group)); } -void Excl( Group group, int n, const int* ranks, Group& subGroup ) +void Excl (Group group, int n, const int *ranks, + Group & subGroup) { - DEBUG_ONLY(CallStackEntry cse("mpi::Excl")) - SafeMpi( - MPI_Group_excl - ( group.group, n, const_cast(ranks), &subGroup.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Excl")) + SafeMpi (MPI_Group_excl + (group.group, n, + const_cast < int *>(ranks), 
+ &subGroup.group)); } -void Difference( Group parent, Group subset, Group& complement ) +void Difference (Group parent, Group subset, + Group & complement) { - DEBUG_ONLY(CallStackEntry cse("mpi::Difference")) - SafeMpi( - MPI_Group_difference( parent.group, subset.group, &complement.group ) - ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Difference")) + SafeMpi (MPI_Group_difference + (parent.group, subset.group, + &complement.group)); } -void Free( Group& group ) +void Free (Group & group) { - DEBUG_ONLY(CallStackEntry cse("mpi::Free")) - SafeMpi( MPI_Group_free( &group.group ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Free")) + SafeMpi (MPI_Group_free (&group.group)); } // Rank translations // ================= -int Translate( Group origGroup, int origRank, Group newGroup ) +int Translate (Group origGroup, int origRank, + Group newGroup) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origGroup, 1, &origRank, newGroup, + &newRank); + return newRank; +} + +int Translate (Comm origComm, int origRank, + Group newGroup) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origComm, 1, &origRank, newGroup, + &newRank); + return newRank; +} + +int Translate (Group origGroup, int origRank, + Comm newComm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origGroup, 1, &origRank, newComm, + &newRank); + return newRank; +} + +int Translate (Comm origComm, int origRank, Comm newComm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origGroup, 1, &origRank, newGroup, &newRank ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) int + newRank; + Translate (origComm, 1, &origRank, newComm, &newRank); return newRank; } -int Translate( Comm origComm, int origRank, Group newGroup ) +void Translate +(Group origGroup, int size, const int *origRanks, + Group newGroup, int *newRanks) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) + SafeMpi 
+ (MPI_Group_translate_ranks + (origGroup.group, size, + const_cast < int *>(origRanks), + newGroup.group, newRanks)); +} + +void Translate +(Comm origComm, int size, const int *origRanks, + Group newGroup, int *newRanks) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) + Group origGroup; + + CommGroup (origComm, origGroup); + Translate (origGroup, size, origRanks, newGroup, + newRanks); + Free (origGroup); +} + +void Translate +(Group origGroup, int size, const int *origRanks, + Comm newComm, int *newRanks) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) + Group newGroup; + + CommGroup (newComm, newGroup); + Translate (origGroup, size, origRanks, newGroup, + newRanks); + Free (newGroup); +} + +void Translate +(Comm origComm, int size, const int *origRanks, + Comm newComm, int *newRanks) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Translate")) + Group origGroup, newGroup; + + CommGroup (origComm, origGroup); + CommGroup (newComm, newGroup); + Translate (origGroup, size, origRanks, newGroup, + newRanks); + Free (origGroup); + Free (newGroup); +} + +// DERIVED Datatype creation +// ========================= +// FIXME these functions for DDT creation are +// completely untested +#ifdef EL_USE_DERIVED_DATATYPE +void StridedDatatype (El_strided_t* stride_descr, + Datatype old_type, Datatype* new_type, + size_t* source_dims) +{ + int old_type_size; + SafeMpi (MPI_Type_size (old_type, &old_type_size)); + int *dims = NULL, *sizes = NULL; + + // count of blocks must be non-zero + assert (stride_descr->num > 0); + // size is NULL + assert (stride_descr->sizes != NULL); + // offset is NULL + assert (stride_descr->offsets != NULL); + + // check for contiguous transfers + if ((source_dims == NULL) && (stride_descr->num == 1)) + { + int elem_count = stride_descr->sizes[0] / old_type_size; + // derived datatype is not a multiple of original type + assert ((stride_descr->sizes[0] % old_type_size == 0)); + SafeMpi ( MPI_Type_contiguous (elem_count, old_type, new_type) 
); + return; + } + // offsets should be monotonic increasing + for (int i = 1; i < stride_descr->num; i++) + assert (stride_descr->offsets[i] >= stride_descr->offsets[i - 1]); + /* Notes: + * Sayan: This weird hack is because MPI_Type_create_subarray throws an error when + * stride_descr->sizes and source_dims is passed directly (probably type mismatch?) */ + /* heap */ + dims = new int[stride_descr->num]; + sizes = new int[stride_descr->num]; + + for (int i = 0; i < stride_descr->num; i++) + { + dims[i] = EL_INT_SAFE_CAST (source_dims[i]); + sizes[i] = EL_INT_SAFE_CAST (stride_descr->sizes[i]); + } + + SafeMpi ( MPI_Type_create_subarray (stride_descr->num, reinterpret_cast(dims), + reinterpret_cast(sizes), + reinterpret_cast(stride_descr->offsets), MPI_ORDER_C, + old_type, new_type) ); + + delete[] dims; + delete[] sizes; +} + +void VectorDatatype (El_iov_t * vect_descr, + Datatype old_type, Datatype * new_type, + vector_pattern_t data_pattern) +{ + int old_type_size; + int stride; + int fixed_block_fixed_stride = 1, // MPI_Type_vector + fixed_block_var_stride = 1; // MPI_Type_hindexed_block + /* defaults: + * var_block_var_stride=1 - MPI_Type_hindexed + * var_block_fixed_stride=1 - MPI_Type_hindexed + */ + SafeMpi ( MPI_Type_size (old_type, &old_type_size) ); + // count of blocks must be non-zero + assert (vect_descr->count > 0); + // size is NULL + assert (vect_descr->sizes != NULL); + // offset is NULL + assert (vect_descr->offsets != NULL); + // check for contiguous transfers + if (vect_descr->count == 1) + { + int elem_count = vect_descr->sizes[0] / old_type_size; + // derived datatype is not a multiple of original type + assert (vect_descr->sizes[0] % old_type_size == 0); + SafeMpi ( MPI_Type_contiguous (elem_count, old_type, new_type) ); + return; + } + // offsets should be monotonic increasing + for (int i = 1; i < vect_descr->count; i++) + assert (vect_descr->offsets[i] >= vect_descr->offsets[i - 1]); + + // identify the pattern of strides, fixed or 
varying + if (data_pattern == UNKNOWN_BLOCK_STRIDE) + { + stride = (vect_descr->offsets[1] - vect_descr->offsets[0]); + for (int i = 1; i < vect_descr->count; i++) + { + // check for fixed blocklengths and fixed strides + if ((vect_descr->sizes[i] == vect_descr->sizes[i - 1]) && + (stride == + (vect_descr->offsets[i] - vect_descr->offsets[i - 1]))) + fixed_block_fixed_stride++; + + // check for fixed blocklengths and variable strides + if ((vect_descr->sizes[i] == vect_descr->sizes[i - 1]) && + !(stride == + (vect_descr->offsets[i] - vect_descr->offsets[i - 1]))) + fixed_block_var_stride++; + } + } + + if (data_pattern == FIXED_BLOCK_FIXED_STRIDE) + fixed_block_fixed_stride = vect_descr->count; + + if (data_pattern == FIXED_BLOCK_VAR_STRIDE) + fixed_block_var_stride = vect_descr->count; + + // check if constant strides, if yes + // then create _type_vector, else + // _type_hindexed + if (fixed_block_fixed_stride == vect_descr->count) + { // _vector + int stride = ((vect_descr->offsets[1] - vect_descr->offsets[0]) + / old_type_size); + int blocklength = vect_descr->sizes[0]; + SafeMpi ( MPI_Type_vector (vect_descr->count, blocklength, + stride, old_type, new_type) ); + } + else if (fixed_block_var_stride == vect_descr->count) // _hindexed_block + SafeMpi ( MPI_Type_create_hindexed_block (vect_descr->count, vect_descr->sizes[0], + vect_descr->offsets, old_type, new_type) ); + else // _hindexed + SafeMpi ( MPI_Type_create_hindexed (vect_descr->count, + (const int *) vect_descr->sizes, + vect_descr->offsets, old_type, new_type) ); +} +#endif // EL_USE_DERIVED_DATATYPE + +// MPI-3 RMA functions +// ================== + +#if MPI_VERSION>=3 && defined(EL_ENABLE_RMA_AXPY) +long ReadInc (Window & win, Aint offset, long inc, int fop_root) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ReadInc")) + long otemp; + SafeMpi ( MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, fop_root, offset, MPI_SUM, + win) ); + SafeMpi ( MPI_Win_flush_local (fop_root, win) ); + + return otemp; +} + +void 
SetWindowProp (Window & window, int prop) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::SetWindowProp")) + Info info; + + SafeMpi (MPI_Info_create (&info)); + + if (prop & (1 << 0)) // strict + SafeMpi (MPI_Info_set + (info, "accumulate_ordering", + "rar,raw,war,waw")); + + + if (prop & (1 << 1)) // partial + SafeMpi (MPI_Info_set + (info, "accumulate_ordering", + "rar,waw")); + + if (prop & (1 << 2)) // none + SafeMpi (MPI_Info_set + (info, "accumulate_ops", + "same_op_no_op")); + + SafeMpi (MPI_Win_set_info (window, info)); +} + +//NOTE assuming MPI_MODE_NOCHECK +void WindowLock (int rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) + SafeMpi (MPI_Win_lock + (MPI_LOCK_SHARED, rank, MPI_MODE_NOCHECK, + window)); +} + +void WindowLock (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowLock")) + SafeMpi (MPI_Win_lock_all + (MPI_MODE_NOCHECK, window)); +} + +void WindowUnlock (int rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) + SafeMpi (MPI_Win_unlock (rank, window)); +} + +void WindowUnlock (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowUnlock")) + SafeMpi (MPI_Win_unlock_all (window)); +} + +// RMA Utilities +void WindowCreate (void *baseptr, int size, Comm comm, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowCreate")) + + // TODO use alloc_shm + SafeMpi (MPI_Win_create + (baseptr, (MPI_Aint) size, 1, MPI_INFO_NULL, + comm.comm, &window)); +#ifdef EL_NO_ACC_ORDERING + SetWindowProp (window, NO_ACC_ORDERING); +#endif +} + +void CheckBounds (Window & window, Datatype win_type, Datatype type, + size_t count, ptrdiff_t target_offset) +{ + int flag, type_size, win_type_size; + size_t displ; + void * dest=NULL; + + SafeMpi (MPI_Type_size (type, &type_size)); + SafeMpi (MPI_Type_size (win_type, &win_type_size)); + Aint lb, extent; + + SafeMpi (MPI_Win_get_attr(window, MPI_WIN_BASE, dest, &flag /* unused */)); + + /* Calculate displacement from beginning of 
the window */ + if (dest == MPI_BOTTOM) + displ = 0; + else + displ = (size_t) ((uint8_t*)((uint8_t*)dest + target_offset * type_size) - (uint8_t*)dest); + + SafeMpi (MPI_Type_get_true_extent(type, &lb, &extent)); + + // invalid remote address + assert (displ >= 0 && displ < win_type_size); + // transfer out of range + assert (displ + count*extent <= win_type_size); +} + +#ifdef EL_EXPLICIT_PROGRESS +void RmaProgress ( Comm comm ) +{ + int flag; + SafeMpi (MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, + comm.comm, &flag, MPI_STATUS_IGNORE)); +} +#endif + +void WindowFree (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::WindowFree")) + SafeMpi (MPI_Win_free (&window)); +} + +// put +template +void Iput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) +#ifdef EL_ENSURE_PUT_ATOMICITY + SafeMpi (MPI_Accumulate + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), MPI_REPLACE, window)); +#else + SafeMpi (MPI_Put + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window)); +#endif +} + +template +void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iput")) +#ifdef EL_ENSURE_PUT_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Accumulate + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), MPI_REPLACE, window)); +#else + SafeMpi (MPI_Accumulate + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), MPI_REPLACE, window)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Put + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window)); +#else + SafeMpi (MPI_Put + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window)); +#endif +#endif +} + +template 
+void Rput (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) +#ifdef EL_ENSURE_PUT_ATOMICITY + SafeMpi (MPI_Raccumulate + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), MPI_REPLACE, window, &request)); +#else + SafeMpi (MPI_Rput + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window, &request)); +#endif +} + +template +void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rput")) +#ifdef EL_ENSURE_PUT_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Raccumulate + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), MPI_REPLACE, window, &request)); +#else + SafeMpi (MPI_Raccumulate + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), MPI_REPLACE, window, &request)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rput + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window, &request)); +#else + SafeMpi (MPI_Rput + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window, &request)); +#endif +#endif +} +template void Iput (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int 
target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iput (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iput (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Rput (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rput (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Rput (const float* source, int origin_count, 
int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rput (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 +template +void Iput( T source, int target_rank, Aint disp, Window& window ) +{ + Iput ( &source, 1, target_rank, disp, 1, window ); +} + +template +void Rput( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ + Rput ( &source, 1, target_rank, disp, 1, window, request ); +} + +template void Rput (const byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rput (const long long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Rput (const float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rput (const Complex source, int target_rank, + Aint disp, Window & window, 
Request & request); +template void Rput (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iput (const byte source, int target_rank, + Aint disp, Window & window); +template void Iput (const int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned source, int target_rank, + Aint disp, Window & window); +template void Iput (const long int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iput (const long long int source, int target_rank, + Aint disp, Window & window); +template void Iput (const unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iput (const float source, int target_rank, + Aint disp, Window & window); +template void Iput (const double source, int target_rank, + Aint disp, Window & window); +template void Iput (const Complex source, int target_rank, + Aint disp, Window & window); +template void Iput (const Complex source, int target_rank, + Aint disp, Window & window); +// get +template +void Iget (R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) +#ifdef EL_ENSURE_GET_ATOMICITY + SafeMpi (MPI_Get_accumulate + (NULL, 0, TypeMap(), source, + origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), MPI_NO_OP, window)); +#else + SafeMpi (MPI_Get + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window)); +#endif +} + +template +void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iget")) +#ifdef EL_ENSURE_GET_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Get_accumulate + (NULL, 0, TypeMap(), source, + 2*origin_count, 
TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), MPI_NO_OP, window)); +#else + SafeMpi (MPI_Get_accumulate + (NULL, 0, TypeMap>(), source, + origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), MPI_NO_OP, window)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Get + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window)); +#else + SafeMpi (MPI_Get + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window)); +#endif +#endif +} + +template +void Rget (R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) +#ifdef EL_ENSURE_GET_ATOMICITY + SafeMpi (MPI_Rget_accumulate + (NULL, 0, TypeMap(), source, + origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), MPI_NO_OP, window, + &request)); +#else + SafeMpi (MPI_Rget + (source, origin_count, TypeMap(), + target_rank, disp, target_count, + TypeMap(), window, &request)); +#endif +} + +template +void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Rget")) +#ifdef EL_ENSURE_GET_ATOMICITY +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rget_accumulate + (NULL, 0, TypeMap(), source, + 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), MPI_NO_OP, window, &request)); +#else + SafeMpi (MPI_Rget_accumulate + (NULL, 0, TypeMap>(), source, + origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), MPI_NO_OP, window, &request)); +#endif +#else +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Rget + (source, 2*origin_count, TypeMap(), + target_rank, disp, 2*target_count, + TypeMap(), window, &request)); +#else + SafeMpi (MPI_Rget + (source, origin_count, TypeMap>(), + target_rank, disp, target_count, + TypeMap>(), window, &request)); 
+#endif +#endif +} +template void Iget (byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iget (long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#endif +template void Iget (float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Rget (byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget 
(unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rget (long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Rget (float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Rget (Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 +template +void Iget( T source, int target_rank, Aint disp, Window& window ) +{ + Iget ( &source, 1, target_rank, disp, 1, window ); +} + +template +void Rget( T source, int target_rank, Aint disp, + Window& window, Request& request ) +{ + Rget ( &source, 1, target_rank, disp, 1, window, request ); +} + +template void Rget (byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Rget (long long int source, int target_rank, + Aint disp, Window & 
window, Request & request); +template void Rget (unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Rget (float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Rget (Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iget (byte source, int target_rank, + Aint disp, Window & window); +template void Iget (int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned source, int target_rank, + Aint disp, Window & window); +template void Iget (long int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iget (long long int source, int target_rank, + Aint disp, Window & window); +template void Iget (unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iget (float source, int target_rank, + Aint disp, Window & window); +template void Iget (double source, int target_rank, + Aint disp, Window & window); +template void Iget (Complex source, int target_rank, + Aint disp, Window & window); +template void Iget (Complex source, int target_rank, + Aint disp, Window & window); + +// acc +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) + SafeMpi (MPI_Accumulate + (source, origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window)); +} + +template +void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op 
op, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Iaccumulate")) +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Accumulate + (source, 2*origin_count, + TypeMap(), target_rank, disp, + 2*target_count, TypeMap(), op.op, + window)); +#else + SafeMpi (MPI_Accumulate + (source, origin_count, + TypeMap>(), target_rank, disp, + target_count, TypeMap>(), op.op, + window)); +#endif +} + +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) + SafeMpi (MPI_Raccumulate + (source, origin_count, + TypeMap(), target_rank, disp, + target_count, TypeMap(), op.op, + window, &request)); +} + +template +void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Raccumulate")) +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi (MPI_Raccumulate + (source, 2*origin_count, + TypeMap(), target_rank, disp, + 2*target_count, TypeMap(), op.op, + window, &request)); +#else + SafeMpi (MPI_Raccumulate + (source, origin_count, + TypeMap>(), target_rank, disp, + target_count, TypeMap>(), op.op, + window, &request)); +#endif +} +template void Iacc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG 
+template void Iacc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +#endif +template void Iacc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window); + +template void Racc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +#endif +template void Racc (const 
float* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Op op, Window & window, Request & request); + +// op = SUM +template +void Iacc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window) +{ + Iacc ( source, origin_count, target_rank, disp, target_count, SUM, window ); +} + +template +void Racc (const R* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, + Request & request) +{ + Racc ( source, origin_count, target_rank, disp, target_count, SUM, window, request ); +} + +template void Iacc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iacc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); 
+#endif +template void Iacc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); +template void Iacc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window); + +template void Racc (const byte* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const unsigned long long* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +#endif +template void Racc (const float* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const double* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int 
target_count, Window & window, Request & request); +template void Racc (const Complex* source, int origin_count, int target_rank, + Aint disp, int target_count, Window & window, Request & request); + +// when source-target size == 1 and op = SUM +template +void Iacc (const T source, int target_rank, Aint disp, Window & window) +{ + Iacc ( &source, 1, target_rank, disp, 1, SUM, window ); +} + +template +void Racc (const T source, int target_rank, Aint disp, Window & window, + Request & request) +{ + Racc ( &source, 1, target_rank, disp, 1, SUM, window, request ); +} + +template void Racc (const byte source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned long source, int target_rank, + Aint disp, Window & window, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Racc (const long long int source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const unsigned long long source, int target_rank, + Aint disp, Window & window, Request & request); +#endif +template void Racc (const float source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const double source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); +template void Racc (const Complex source, int target_rank, + Aint disp, Window & window, Request & request); + +template void Iacc (const byte source, int target_rank, + Aint disp, Window & window); +template void Iacc (const int source, int target_rank, + Aint disp, 
Window & window); +template void Iacc (const unsigned source, int target_rank, + Aint disp, Window & window); +template void Iacc (const long int source, int target_rank, + Aint disp, Window & window); +template void Iacc (const unsigned long source, int target_rank, + Aint disp, Window & window); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Iacc (const long long int source, int target_rank, + Aint disp, Window & window); +template void Iacc (const unsigned long long source, int target_rank, + Aint disp, Window & window); +#endif +template void Iacc (const float source, int target_rank, + Aint disp, Window & window); +template void Iacc (const double source, int target_rank, + Aint disp, Window & window); +template void Iacc (const Complex source, int target_rank, + Aint disp, Window & window); +template void Iacc (const Complex source, int target_rank, + Aint disp, Window & window); + +// Synchronization +// --------------- +void Flush (int target_rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush (target_rank, window)); +} + +void Flush (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Flush")) + SafeMpi (MPI_Win_flush_all (window)); +} + +void FlushLocal (int target_rank, Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) + SafeMpi (MPI_Win_flush_local (target_rank, window)); +} + +void FlushLocal (Window & window) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::FlushLocal")) + SafeMpi (MPI_Win_flush_local_all (window)); +} +#endif // EL_ENABLE_RMA_AXPY + +// Various utilities +// ================= +// Free request +void RequestFree (Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::RequestFree")) + SafeMpi (MPI_Request_free (&request)); +} + +// Wait until every process in comm reaches this statement +void Barrier (Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Barrier")) + SafeMpi (MPI_Barrier (comm.comm)); +} + +#if MPI_VERSION>=3 && 
defined(EL_USE_IBARRIER_FOR_AXPY) +void IBarrier (Comm comm, Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::IBarrier")) + SafeMpi (MPI_Ibarrier (comm.comm, &request)); +} +#endif + + +// Test for completion +bool Test (Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) + + Status status; + int flag; + + SafeMpi (MPI_Test (&request, &flag, &status)); + if (flag) + return true; + else + return false; +} + +bool Test (Request & request, Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Test")) + int flag; + SafeMpi (MPI_Test (&request, &flag, &status)); + + if (flag) + return true; + else + return false; +} + +bool Testany (int count, Request * requests, int &indx, + Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + int flag; + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + if (flag) + return true; + else + return false; +} + +bool Testany (int count, Request * requests, int &indx) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origComm, 1, &origRank, newGroup, &newRank ); - return newRank; + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + + int flag; + Status status; + + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + if (flag) + return true; + else + return false; } -int Translate( Group origGroup, int origRank, Comm newComm ) +bool Testany (int count, Request * requests) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int newRank; - Translate( origGroup, 1, &origRank, newComm, &newRank ); - return newRank; + DEBUG_ONLY (CallStackEntry cse ("mpi::Testany")) + + int flag, indx; + Status status; + + SafeMpi (MPI_Testany + (count, requests, &indx, &flag, &status)); + if (flag) + return true; + else + return false; } -int Translate( Comm origComm, int origRank, Comm newComm ) +// Ensure that the request finishes before continuing +void Wait (Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - int 
newRank; - Translate( origComm, 1, &origRank, newComm, &newRank ); - return newRank; + DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) + SafeMpi (MPI_Wait (&request, MPI_STATUS_IGNORE)); } -void Translate -( Group origGroup, int size, const int* origRanks, - Group newGroup, int* newRanks ) +// Ensure that the request finishes before continuing +void Wait (Request & request, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - SafeMpi - ( MPI_Group_translate_ranks - ( origGroup.group, size, const_cast(origRanks), - newGroup.group, newRanks ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Wait")) + SafeMpi (MPI_Wait (&request, &status)); } -void Translate -( Comm origComm, int size, const int* origRanks, - Group newGroup, int* newRanks ) +// Ensure that several requests finish before continuing +void WaitAll (int numRequests, Request * requests) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - Group origGroup; - CommGroup( origComm, origGroup ); - Translate( origGroup, size, origRanks, newGroup, newRanks ); - Free( origGroup ); + + DEBUG_ONLY(CallStackEntry cse("mpi::WaitAll")) + vector statuses( numRequests ); + SafeMpi( MPI_Waitall( numRequests, requests, statuses.data() ) ); } -void Translate -( Group origGroup, int size, const int* origRanks, - Comm newComm, int* newRanks ) +// Ensure that several requests finish before continuing +void WaitAll (int numRequests, Request * requests, + Status * statuses) { - DEBUG_ONLY(CallStackEntry cse("mpi::Translate")) - Group newGroup; - CommGroup( newComm, newGroup ); - Translate( origGroup, size, origRanks, newGroup, newRanks ); - Free( newGroup ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WaitAll")) + SafeMpi (MPI_Waitall + (numRequests, requests, statuses)); } -void Translate -( Comm origComm, int size, const int* origRanks, - Comm newComm, int* newRanks ) +// Ensure that any requests finish before continuing +void WaitAny (int numRequests, Request * requests, Int * index) { - DEBUG_ONLY(CallStackEntry 
cse("mpi::Translate")) - Group origGroup, newGroup; - CommGroup( origComm, origGroup ); - CommGroup( newComm, newGroup ); - Translate( origGroup, size, origRanks, newGroup, newRanks ); - Free( origGroup ); - Free( newGroup ); + DEBUG_ONLY (CallStackEntry cse ("mpi::WaitAny")) + SafeMpi (MPI_Waitany + (numRequests, requests, index, MPI_STATUS_IGNORE)); } -// Various utilities -// ================= - -// Wait until every process in comm reaches this statement -void Barrier( Comm comm ) +// Nonblocking test for message completion +bool IProbe (int source, int tag, Comm comm, + Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Barrier")) - SafeMpi( MPI_Barrier( comm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::IProbe")) int + flag; + SafeMpi (MPI_Iprobe + (source, tag, comm.comm, &flag, &status)); + return flag; } -// Test for completion -bool Test( Request& request ) +bool IProbe (int source, Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Test")) - Status status; - int flag; - SafeMpi( MPI_Test( &request, &flag, &status ) ); - return flag; + return IProbe (source, mpi::ANY_TAG, comm, status); } -// Ensure that the request finishes before continuing -void Wait( Request& request ) +bool IProbe (Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Wait")) - Status status; - SafeMpi( MPI_Wait( &request, &status ) ); + return IProbe (mpi::ANY_SOURCE, mpi::ANY_TAG, comm, status); } -// Ensure that the request finishes before continuing -void Wait( Request& request, Status& status ) +void Probe (int source, int tag, Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Wait")) - SafeMpi( MPI_Wait( &request, &status ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Probe")) + SafeMpi (MPI_Probe(source, tag, comm.comm, &status)); } -// Ensure that several requests finish before continuing -void WaitAll( int numRequests, Request* requests ) +void Probe (int source, Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry 
cse("mpi::WaitAll")) - vector statuses( numRequests ); - SafeMpi( MPI_Waitall( numRequests, requests, statuses.data() ) ); + Probe (source, mpi::ANY_TAG, comm, status); } -// Ensure that several requests finish before continuing -void WaitAll( int numRequests, Request* requests, Status* statuses ) +void Probe (Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::WaitAll")) - SafeMpi( MPI_Waitall( numRequests, requests, statuses ) ); + Probe (mpi::ANY_SOURCE, mpi::ANY_TAG, comm, status); } -// Nonblocking test for message completion -bool IProbe( int source, int tag, Comm comm, Status& status ) +bool IMprobe (int source, int tag, Comm comm, + Status & status, Message & message) { - DEBUG_ONLY(CallStackEntry cse("mpi::IProbe")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IMprobe")) int flag; - SafeMpi( MPI_Iprobe( source, tag, comm.comm, &flag, &status ) ); + SafeMpi (MPI_Improbe + (source, tag, comm.comm, &flag, &message, &status)); return flag; } -bool IProbe( int source, Comm comm, Status& status ) -{ return IProbe( source, 0, comm, status ); } -template -int GetCount( Status& status ) +template < typename T > int GetCount (Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::GetCount")) - int count; - SafeMpi( MPI_Get_count( &status, TypeMap(), &count ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::GetCount")) int + count; + SafeMpi (MPI_Get_count + (&status, TypeMap < T > (), &count)); return count; } -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount( Status& status ); +template int GetCount < byte > (Status & status); +template int GetCount < int >(Status & status); +template int GetCount < unsigned >(Status & status); +template int GetCount < long int >(Status & status); +template int GetCount < unsigned long >(Status & status); + #ifdef EL_HAVE_MPI_LONG_LONG -template int GetCount( Status& status ); -template 
int GetCount( Status& status ); +template int GetCount < long long int >(Status & status); +template int GetCount < +unsigned long long >(Status & status); #endif -template int GetCount( Status& status ); -template int GetCount( Status& status ); -template int GetCount>( Status& status ); -template int GetCount>( Status& status ); - -template -void TaggedSend( const R* buf, int count, int to, int tag, Comm comm ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::Send")) - SafeMpi( - MPI_Send( const_cast(buf), count, TypeMap(), to, tag, comm.comm ) - ); +template int GetCount < float >(Status & status); +template int GetCount < double >(Status & status); +template int GetCount < Complex < +float >>(Status & status); +template int GetCount < Complex < +double >>(Status & status); + +template < typename R > +void TaggedSend (const R * buf, int count, int to, + int tag, Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Send")) + SafeMpi (MPI_Send + (const_cast < R * >(buf), count, + TypeMap < R > (), to, tag, comm.comm)); } -template -void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ) +template < typename R > +void TaggedSend (const Complex < R > *buf, int count, + int to, int tag, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Send")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Send")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Send - ( const_cast*>(buf), 2*count, TypeMap(), to, - tag, comm.comm ) ); + (MPI_Send + (const_cast < Complex < R > *>(buf), 2 * count, + TypeMap < R > (), to, tag, comm.comm)); #else SafeMpi - ( MPI_Send - ( const_cast*>(buf), count, - TypeMap>(), to, tag, comm.comm ) ); + (MPI_Send + (const_cast < Complex < R > *>(buf), count, + TypeMap < Complex < R >> (), to, tag, + comm.comm)); #endif } -template void TaggedSend( const byte* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned* buf, int count, int to, 
int tag, Comm comm ); -template void TaggedSend( const long int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned long* buf, int count, int to, int tag, Comm comm ); +template void TaggedSend (const byte * buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const int *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const unsigned *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const long int *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const unsigned long *buf, + int count, int to, int tag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedSend( const long long int* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const unsigned long long* buf, int count, int to, int tag, Comm comm ); +template void TaggedSend (const long long int *buf, + int count, int to, int tag, + Comm comm); +template void TaggedSend (const unsigned long long *buf, + int count, int to, int tag, + Comm comm); #endif -template void TaggedSend( const float* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const double* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ); -template void TaggedSend( const Complex* buf, int count, int to, int tag, Comm comm ); - -template -void Send( const T* buf, int count, int to, Comm comm ) -{ TaggedSend( buf, count, to, 0, comm ); } +template void TaggedSend (const float *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const double *buf, int count, + int to, int tag, Comm comm); +template void TaggedSend (const Complex < float >*buf, + int count, int to, int tag, + Comm comm); +template void TaggedSend (const Complex < double >*buf, + int count, int to, int tag, + Comm comm); + +template < typename T > +void Send (const T * buf, int count, int to, + Comm 
comm) +{ + TaggedSend (buf, count, to, 0, comm); +} -template void Send( const byte* buf, int count, int to, Comm comm ); -template void Send( const int* buf, int count, int to, Comm comm ); -template void Send( const unsigned* buf, int count, int to, Comm comm ); -template void Send( const long int* buf, int count, int to, Comm comm ); -template void Send( const unsigned long* buf, int count, int to, Comm comm ); +template void Send (const byte * buf, int count, int to, + Comm comm); +template void Send (const int *buf, int count, int to, + Comm comm); +template void Send (const unsigned *buf, int count, + int to, Comm comm); +template void Send (const long int *buf, int count, + int to, Comm comm); +template void Send (const unsigned long *buf, int count, + int to, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Send( const long long int* buf, int count, int to, Comm comm ); -template void Send( const unsigned long long* buf, int count, int to, Comm comm ); +template void Send (const long long int *buf, int count, + int to, Comm comm); +template void Send (const unsigned long long *buf, + int count, int to, Comm comm); #endif -template void Send( const float* buf, int count, int to, Comm comm ); -template void Send( const double* buf, int count, int to, Comm comm ); -template void Send( const Complex* buf, int count, int to, Comm comm ); -template void Send( const Complex* buf, int count, int to, Comm comm ); - -template -void TaggedSend( T b, int to, int tag, Comm comm ) -{ TaggedSend( &b, 1, to, tag, comm ); } +template void Send (const float *buf, int count, int to, + Comm comm); +template void Send (const double *buf, int count, int to, + Comm comm); +template void Send (const Complex < float >*buf, + int count, int to, Comm comm); +template void Send (const Complex < double >*buf, + int count, int to, Comm comm); + +template < typename T > +void TaggedSend (T b, int to, int tag, Comm comm) +{ + TaggedSend (&b, 1, to, tag, comm); +} -template void 
TaggedSend( byte b, int to, int tag, Comm comm ); -template void TaggedSend( int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned b, int to, int tag, Comm comm ); -template void TaggedSend( long int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned long b, int to, int tag, Comm comm ); +template void TaggedSend (byte b, int to, int tag, + Comm comm); +template void TaggedSend (int b, int to, int tag, + Comm comm); +template void TaggedSend (unsigned b, int to, int tag, + Comm comm); +template void TaggedSend (long int b, int to, int tag, + Comm comm); +template void TaggedSend (unsigned long b, int to, + int tag, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedSend( long long int b, int to, int tag, Comm comm ); -template void TaggedSend( unsigned long long b, int to, int tag, Comm comm ); +template void TaggedSend (long long int b, int to, + int tag, Comm comm); +template void TaggedSend (unsigned long long b, int to, + int tag, Comm comm); #endif -template void TaggedSend( float b, int to, int tag, Comm comm ); -template void TaggedSend( double b, int to, int tag, Comm comm ); -template void TaggedSend( Complex b, int to, int tag, Comm comm ); -template void TaggedSend( Complex b, int to, int tag, Comm comm ); +template void TaggedSend (float b, int to, int tag, + Comm comm); +template void TaggedSend (double b, int to, int tag, + Comm comm); +template void TaggedSend (Complex < float >b, int to, + int tag, Comm comm); +template void TaggedSend (Complex < double >b, int to, + int tag, Comm comm); + +template < typename T > void Send (T b, int to, Comm comm) +{ + TaggedSend (b, to, 0, comm); +} -template -void Send( T b, int to, Comm comm ) -{ TaggedSend( b, to, 0, comm ); } +template void Send (byte b, int to, Comm comm); +template void Send (int b, int to, Comm comm); +template void Send (unsigned b, int to, Comm comm); +template void Send (long int b, int to, Comm comm); +template void Send (unsigned long b, 
int to, Comm comm); -template void Send( byte b, int to, Comm comm ); -template void Send( int b, int to, Comm comm ); -template void Send( unsigned b, int to, Comm comm ); -template void Send( long int b, int to, Comm comm ); -template void Send( unsigned long b, int to, Comm comm ); #ifdef EL_HAVE_MPI_LONG_LONG -template void Send( long long int b, int to, Comm comm ); -template void Send( unsigned long long b, int to, Comm comm ); +template void Send (long long int b, int to, Comm comm); +template void Send (unsigned long long b, int to, + Comm comm); #endif -template void Send( float b, int to, Comm comm ); -template void Send( double b, int to, Comm comm ); -template void Send( Complex b, int to, Comm comm ); -template void Send( Complex b, int to, Comm comm ); - -template +template void Send (float b, int to, Comm comm); +template void Send (double b, int to, Comm comm); +template void Send (Complex < float >b, int to, + Comm comm); +template void Send (Complex < double >b, int to, + Comm comm); + +template < typename R > void TaggedISend -( const R* buf, int count, int to, int tag, Comm comm, Request& request ) -{ - DEBUG_ONLY(CallStackEntry cse("mpi::ISend")) +(const R * buf, int count, int to, int tag, Comm comm, + Request & request) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::ISend")) SafeMpi - ( MPI_Isend - ( const_cast(buf), count, TypeMap(), to, - tag, comm.comm, &request ) ); + (MPI_Isend + (const_cast < R * >(buf), count, + TypeMap < R > (), to, tag, comm.comm, + &request)); } -template +template < typename R > void TaggedISend -( const Complex* buf, int count, int to, int tag, Comm comm, - Request& request ) +(const Complex < R > *buf, int count, int to, int tag, + Comm comm, Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::ISend")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ISend")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Isend - ( const_cast*>(buf), 2*count, TypeMap(), to, tag, comm.comm, - &request ) ); + (MPI_Isend + (const_cast < 
Complex < R > *>(buf), 2 * count, + TypeMap < R > (), to, tag, comm.comm, + &request)); #else SafeMpi - ( MPI_Isend - ( const_cast*>(buf), count, - TypeMap>(), to, tag, comm.comm, &request ) ); + (MPI_Isend + (const_cast < Complex < R > *>(buf), count, + TypeMap < Complex < R >> (), to, tag, comm.comm, + &request)); #endif } -template void TaggedISend( const byte* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISend (const byte * buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const int *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const unsigned *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const long int *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const unsigned long *buf, + int count, int to, int tag, + Comm comm, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedISend( const long long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const unsigned long long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISend (const long long int *buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (const unsigned long long *buf, + int count, int to, int tag, + Comm comm, Request & request); #endif -template void TaggedISend( const float* buf, int count, int to, int tag, 
Comm comm, Request& request ); -template void TaggedISend( const double* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); - -template +template void TaggedISend (const float *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const double *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISend (const Complex < float >*buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (const Complex < double >*buf, + int count, int to, int tag, + Comm comm, Request & request); + +template < typename T > void ISend -( const T* buf, int count, int to, Comm comm, Request& request ) -{ TaggedISend( buf, count, to, 0, comm, request ); } +(const T * buf, int count, int to, Comm comm, + Request & request) +{ + TaggedISend (buf, count, to, 0, comm, request); +} -template void ISend( const byte* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const long int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned long* buf, int count, int to, Comm comm, Request& request ); +template void ISend (const byte * buf, int count, int to, + Comm comm, Request & request); +template void ISend (const int *buf, int count, int to, + Comm comm, Request & request); +template void ISend (const unsigned *buf, int count, + int to, Comm comm, + Request & request); +template void ISend (const long int *buf, int count, + int to, Comm comm, + Request & request); +template void ISend (const unsigned long *buf, int 
count, + int to, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void ISend( const long long int* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const unsigned long long* buf, int count, int to, Comm comm, Request& request ); +template void ISend (const long long int *buf, int count, + int to, Comm comm, + Request & request); +template void ISend (const unsigned long long *buf, + int count, int to, Comm comm, + Request & request); #endif -template void ISend( const float* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const double* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const Complex* buf, int count, int to, Comm comm, Request& request ); -template void ISend( const Complex* buf, int count, int to, Comm comm, Request& request ); - -template -void TaggedISend( T b, int to, int tag, Comm comm, Request& request ) -{ TaggedISend( &b, 1, to, tag, comm, request ); } +template void ISend (const float *buf, int count, int to, + Comm comm, Request & request); +template void ISend (const double *buf, int count, int to, + Comm comm, Request & request); +template void ISend (const Complex < float >*buf, + int count, int to, Comm comm, + Request & request); +template void ISend (const Complex < double >*buf, + int count, int to, Comm comm, + Request & request); + +template < typename T > +void TaggedISend (T b, int to, int tag, Comm comm, + Request & request) +{ + TaggedISend (&b, 1, to, tag, comm, request); +} -template void TaggedISend( byte buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( int buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( unsigned buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( long int buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( unsigned long buf, int to, int tag, Comm comm, Request& request ); +template 
void TaggedISend (byte buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (int buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (unsigned buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (long int buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (unsigned long buf, int to, + int tag, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedISend( long long int buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( unsigned long long buf, int to, int tag, Comm comm, Request& request ); +template void TaggedISend (long long int buf, int to, + int tag, Comm comm, + Request & request); +template void TaggedISend (unsigned long long buf, int to, + int tag, Comm comm, + Request & request); #endif -template void TaggedISend( float buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( double buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( Complex buf, int to, int tag, Comm comm, Request& request ); -template void TaggedISend( Complex buf, int to, int tag, Comm comm, Request& request ); - -template -void ISend( T b, int to, Comm comm, Request& request ) -{ TaggedISend( b, to, 0, comm, request ); } +template void TaggedISend (float buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (double buf, int to, int tag, + Comm comm, Request & request); +template void TaggedISend (Complex < float >buf, int to, + int tag, Comm comm, + Request & request); +template void TaggedISend (Complex < double >buf, int to, + int tag, Comm comm, + Request & request); + +template < typename T > +void ISend (T b, int to, Comm comm, Request & request) +{ + TaggedISend (b, to, 0, comm, request); +} -template void ISend( byte buf, int to, Comm comm, Request& request ); -template void ISend( int buf, int to, Comm comm, Request& request 
); -template void ISend( unsigned buf, int to, Comm comm, Request& request ); -template void ISend( long int buf, int to, Comm comm, Request& request ); -template void ISend( unsigned long buf, int to, Comm comm, Request& request ); +template void ISend (byte buf, int to, Comm comm, + Request & request); +template void ISend (int buf, int to, Comm comm, + Request & request); +template void ISend (unsigned buf, int to, Comm comm, + Request & request); +template void ISend (long int buf, int to, Comm comm, + Request & request); +template void ISend (unsigned long buf, int to, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void ISend( long long int buf, int to, Comm comm, Request& request ); -template void ISend( unsigned long long buf, int to, Comm comm, Request& request ); +template void ISend (long long int buf, int to, Comm comm, + Request & request); +template void ISend (unsigned long long buf, int to, + Comm comm, Request & request); #endif -template void ISend( float buf, int to, Comm comm, Request& request ); -template void ISend( double buf, int to, Comm comm, Request& request ); -template void ISend( Complex buf, int to, Comm comm, Request& request ); -template void ISend( Complex buf, int to, Comm comm, Request& request ); - -template +template void ISend (float buf, int to, Comm comm, + Request & request); +template void ISend (double buf, int to, Comm comm, + Request & request); +template void ISend (Complex < float >buf, int to, + Comm comm, Request & request); +template void ISend (Complex < double >buf, int to, + Comm comm, Request & request); + +template < typename R > void TaggedISSend -( const R* buf, int count, int to, int tag, Comm comm, Request& request ) +(const R * buf, int count, int to, int tag, Comm comm, + Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::ISSend")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) + SafeMpi - ( MPI_Issend - ( const_cast(buf), count, TypeMap(), to, - tag, comm.comm, 
&request ) ); + (MPI_Issend + (const_cast < R * >(buf), count, + TypeMap < R > (), to, tag, comm.comm, + &request)); } -template +template < typename R > void TaggedISSend -( const Complex* buf, int count, int to, int tag, Comm comm, - Request& request ) +(const Complex < R > *buf, int count, int to, int tag, + Comm comm, Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::ISSend")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ISSend")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Issend - ( const_cast*>(buf), 2*count, TypeMap(), to, tag, comm.comm, - &request ) ); + (MPI_Issend + (const_cast < Complex < R > *>(buf), 2 * count, + TypeMap < R > (), to, tag, comm.comm, + &request)); #else SafeMpi - ( MPI_Issend - ( const_cast*>(buf), count, - TypeMap>(), to, tag, comm.comm, &request ) ); + (MPI_Issend + (const_cast < Complex < R > *>(buf), count, + TypeMap < Complex < R >> (), to, tag, comm.comm, + &request)); #endif } -template void TaggedISSend( const byte* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const unsigned* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const unsigned long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISSend (const byte * buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISSend (const int *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISSend (const unsigned *buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (const long int *buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (const unsigned long *buf, + int count, int to, int 
tag, + Comm comm, Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedISSend( const long long int* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const unsigned long long* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISSend (const long long int *buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (const unsigned long long *buf, + int count, int to, int tag, + Comm comm, Request & request); #endif -template void TaggedISSend( const float* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const double* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( const Complex* buf, int count, int to, int tag, Comm comm, Request& request ); +template void TaggedISSend (const float *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISSend (const double *buf, int count, + int to, int tag, Comm comm, + Request & request); +template void TaggedISSend (const Complex < float >*buf, + int count, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (const Complex < double >*buf, + int count, int to, int tag, + Comm comm, Request & request); + +template < typename T > +void ISSend (const T * buf, int count, int to, + Comm comm, Request & request) +{ + TaggedISSend (buf, count, to, 0, comm, request); +} -template -void ISSend( const T* buf, int count, int to, Comm comm, Request& request ) -{ TaggedISSend( buf, count, to, 0, comm, request ); } +template void ISSend (const byte * buf, int count, int to, + Comm comm, Request & request); +template void ISSend (const int *buf, int count, int to, + Comm comm, Request & request); +template void ISSend (const unsigned *buf, int count, + int to, 
Comm comm, + Request & request); +template void ISSend (const long int *buf, int count, + int to, Comm comm, + Request & request); +template void ISSend (const unsigned long *buf, int count, + int to, Comm comm, + Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void ISSend (const long long int *buf, int count, + int to, Comm comm, + Request & request); +template void ISSend (const unsigned long long *buf, + int count, int to, Comm comm, + Request & request); +#endif +template void ISSend (const float *buf, int count, int to, + Comm comm, Request & request); +template void ISSend (const double *buf, int count, + int to, Comm comm, + Request & request); +template void ISSend (const Complex < float >*buf, + int count, int to, Comm comm, + Request & request); +template void ISSend (const Complex < double >*buf, + int count, int to, Comm comm, + Request & request); + +template < typename T > +void TaggedISSend (T b, int to, int tag, Comm comm, + Request & request) +{ + TaggedISSend (&b, 1, to, tag, comm, request); +} -template void ISSend( const byte* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const int* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const unsigned* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const long int* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const unsigned long* buf, int count, int to, Comm comm, Request& request ); +template void TaggedISSend (byte b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (int b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (unsigned b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (long int b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (unsigned long b, int to, + int tag, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template 
void ISSend( const long long int* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const unsigned long long* buf, int count, int to, Comm comm, Request& request ); +template void TaggedISSend (long long int b, int to, + int tag, Comm comm, + Request & request); +template void TaggedISSend (unsigned long long b, int to, + int tag, Comm comm, + Request & request); #endif -template void ISSend( const float* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const double* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const Complex* buf, int count, int to, Comm comm, Request& request ); -template void ISSend( const Complex* buf, int count, int to, Comm comm, Request& request ); +template void TaggedISSend (float b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (double b, int to, int tag, + Comm comm, Request & request); +template void TaggedISSend (Complex < float >b, int to, + int tag, Comm comm, + Request & request); +template void TaggedISSend (Complex < double >b, int to, + int tag, Comm comm, + Request & request); + +template < typename R > +void TaggedRecv (R * buf, int count, int from, + int tag, Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) Status + status; + SafeMpi (MPI_Recv + (buf, count, TypeMap < R > (), from, tag, + comm.comm, &status)); +} -template -void TaggedISSend( T b, int to, int tag, Comm comm, Request& request ) -{ TaggedISSend( &b, 1, to, tag, comm, request ); } +template < typename R > +void TaggedRecv (Complex < R > *buf, int count, + int from, int tag, Comm comm) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) Status + status; +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi + (MPI_Recv + (buf, 2 * count, TypeMap < R > (), from, tag, + comm.comm, &status)); +#else + SafeMpi + (MPI_Recv + (buf, count, TypeMap < Complex < R >> (), from, + tag, comm.comm, &status)); +#endif +} -template void TaggedISSend( byte b, int 
to, int tag, Comm comm, Request& request ); -template void TaggedISSend( int b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( unsigned b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( long int b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( unsigned long b, int to, int tag, Comm comm, Request& request ); +template void TaggedRecv (byte * buf, int count, int from, + int tag, Comm comm); +template void TaggedRecv (int *buf, int count, int from, + int tag, Comm comm); +template void TaggedRecv (unsigned *buf, int count, + int from, int tag, Comm comm); +template void TaggedRecv (long int *buf, int count, + int from, int tag, Comm comm); +template void TaggedRecv (unsigned long *buf, int count, + int from, int tag, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedISSend( long long int b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( unsigned long long b, int to, int tag, Comm comm, Request& request ); +template void TaggedRecv (long long int *buf, int count, + int from, int tag, Comm comm); +template void TaggedRecv (unsigned long long *buf, + int count, int from, int tag, + Comm comm); #endif -template void TaggedISSend( float b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( double b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( Complex b, int to, int tag, Comm comm, Request& request ); -template void TaggedISSend( Complex b, int to, int tag, Comm comm, Request& request ); +template void TaggedRecv (float *buf, int count, int from, + int tag, Comm comm); +template void TaggedRecv (double *buf, int count, + int from, int tag, Comm comm); +template void TaggedRecv (Complex < float >*buf, + int count, int from, int tag, + Comm comm); +template void TaggedRecv (Complex < double >*buf, + int count, int from, int tag, + Comm comm); + +template < typename R > +void TaggedRecvS 
(R * buf, int count, int from, + int tag, Comm comm, Status & status) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) + SafeMpi (MPI_Recv + (buf, count, TypeMap < R > (), from, tag, + comm.comm, &status)); +} -template -void TaggedRecv( R* buf, int count, int from, int tag, Comm comm ) +template < typename R > +void TaggedRecvS (Complex < R > *buf, int count, + int from, int tag, Comm comm, Status & status) { - DEBUG_ONLY(CallStackEntry cse("mpi::Recv")) - Status status; + DEBUG_ONLY (CallStackEntry cse ("mpi::Recv")) +#ifdef EL_AVOID_COMPLEX_MPI + SafeMpi + (MPI_Recv + (buf, 2 * count, TypeMap < R > (), from, tag, + comm.comm, &status)); +#else SafeMpi - ( MPI_Recv( buf, count, TypeMap(), from, tag, comm.comm, &status ) ); + (MPI_Recv + (buf, count, TypeMap < Complex < R >> (), from, + tag, comm.comm, &status)); +#endif } -template -void TaggedRecv( Complex* buf, int count, int from, int tag, Comm comm ) +template void TaggedRecvS (byte * buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (int *buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (long int *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned long *buf, int count, + int from, int tag, Comm comm, Status & status); +#ifdef EL_HAVE_MPI_LONG_LONG +template void TaggedRecvS (long long int *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (unsigned long long *buf, + int count, int from, int tag, + Comm comm, Status & status); +#endif +template void TaggedRecvS (float *buf, int count, int from, + int tag, Comm comm, Status & status); +template void TaggedRecvS (double *buf, int count, + int from, int tag, Comm comm, Status & status); +template void TaggedRecvS (Complex < float >*buf, + int count, int from, int tag, + Comm 
comm, Status & status); +template void TaggedRecvS (Complex < double >*buf, + int count, int from, int tag, + Comm comm, Status & status); + +// matching recv +template < typename R > +void TaggedMrecv (R * buf, int count, Message & msg) +{ + DEBUG_ONLY (CallStackEntry cse ("mpi::Mrecv")) + Status status; + SafeMpi (MPI_Mrecv + (buf, count, TypeMap < R > (), + &msg, &status)); +} + +template < typename R > +void TaggedMrecv (Complex < R > *buf, int count, Message & msg) { - DEBUG_ONLY(CallStackEntry cse("mpi::Recv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Mrecv")) Status status; #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Recv( buf, 2*count, TypeMap(), from, tag, comm.comm, &status ) ); + (MPI_Mrecv + (buf, 2 * count, TypeMap < R > (), &msg, &status)); #else SafeMpi - ( MPI_Recv - ( buf, count, TypeMap>(), from, tag, comm.comm, &status ) ); + (MPI_Mrecv + (buf, count, TypeMap < Complex < R >> (), + &msg, &status)); #endif } -template void TaggedRecv( byte* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( int* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( unsigned* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( long int* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( unsigned long* buf, int count, int from, int tag, Comm comm ); +template void TaggedMrecv (byte * buf, int count, Message & msg); +template void TaggedMrecv (int *buf, int count, Message & msg); +template void TaggedMrecv (unsigned *buf, int count, Message & msg); +template void TaggedMrecv (long int *buf, int count, Message & msg); +template void TaggedMrecv (unsigned long *buf, int count, Message & msg); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedRecv( long long int* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( unsigned long long* buf, int count, int from, int tag, Comm comm ); +template void TaggedMrecv (long long int *buf, int count, Message & msg); 
+template void TaggedMrecv (unsigned long long *buf, + int count, Message & msg); #endif -template void TaggedRecv( float* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( double* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( Complex* buf, int count, int from, int tag, Comm comm ); -template void TaggedRecv( Complex* buf, int count, int from, int tag, Comm comm ); - -template -void Recv( T* buf, int count, int from, Comm comm ) -{ TaggedRecv( buf, count, from, mpi::ANY_TAG, comm ); } +template void TaggedMrecv (float *buf, int count, Message & msg); +template void TaggedMrecv (double *buf, int count, Message & msg); +template void TaggedMrecv (Complex < float >*buf, + int count, Message & msg); +template void TaggedMrecv (Complex < double >*buf, + int count, Message & msg); + +template < typename T > +void Recv (T * buf, int count, int from, Comm comm) +{ + TaggedRecv (buf, count, from, mpi::ANY_TAG, comm); +} -template void Recv( byte* buf, int count, int from, Comm comm ); -template void Recv( int* buf, int count, int from, Comm comm ); -template void Recv( unsigned* buf, int count, int from, Comm comm ); -template void Recv( long int* buf, int count, int from, Comm comm ); -template void Recv( unsigned long* buf, int count, int from, Comm comm ); +template void Recv (byte * buf, int count, int from, + Comm comm); +template void Recv (int *buf, int count, int from, + Comm comm); +template void Recv (unsigned *buf, int count, int from, + Comm comm); +template void Recv (long int *buf, int count, int from, + Comm comm); +template void Recv (unsigned long *buf, int count, + int from, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Recv( long long int* buf, int count, int from, Comm comm ); -template void Recv( unsigned long long* buf, int count, int from, Comm comm ); +template void Recv (long long int *buf, int count, + int from, Comm comm); +template void Recv (unsigned long long *buf, int count, + int 
from, Comm comm); #endif -template void Recv( float* buf, int count, int from, Comm comm ); -template void Recv( double* buf, int count, int from, Comm comm ); -template void Recv( Complex* buf, int count, int from, Comm comm ); -template void Recv( Complex* buf, int count, int from, Comm comm ); +template void Recv (float *buf, int count, int from, + Comm comm); +template void Recv (double *buf, int count, int from, + Comm comm); +template void Recv (Complex < float >*buf, int count, + int from, Comm comm); +template void Recv (Complex < double >*buf, int count, + int from, Comm comm); + +template < typename T > T TaggedRecv (int from, int tag, + Comm comm) +{ + T b; -template -T TaggedRecv( int from, int tag, Comm comm ) -{ T b; TaggedRecv( &b, 1, from, tag, comm ); return b; } + TaggedRecv (&b, 1, from, tag, comm); + return b; +} -template byte TaggedRecv( int from, int tag, Comm comm ); -template int TaggedRecv( int from, int tag, Comm comm ); -template unsigned TaggedRecv( int from, int tag, Comm comm ); -template long int TaggedRecv( int from, int tag, Comm comm ); -template unsigned long TaggedRecv( int from, int tag, Comm comm ); +template byte TaggedRecv (int from, int tag, Comm comm); +template int TaggedRecv (int from, int tag, Comm comm); +template unsigned TaggedRecv (int from, int tag, + Comm comm); +template long int TaggedRecv (int from, int tag, + Comm comm); +template unsigned long TaggedRecv (int from, int tag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int TaggedRecv( int from, int tag, Comm comm ); -template unsigned long long TaggedRecv( int from, int tag, Comm comm ); +template long long int TaggedRecv (int from, int tag, + Comm comm); +template unsigned long long TaggedRecv (int from, int tag, + Comm comm); #endif -template float TaggedRecv( int from, int tag, Comm comm ); -template double TaggedRecv( int from, int tag, Comm comm ); -template Complex TaggedRecv( int from, int tag, Comm comm ); -template Complex 
TaggedRecv( int from, int tag, Comm comm ); +template float TaggedRecv (int from, int tag, Comm comm); +template double TaggedRecv (int from, int tag, Comm comm); +template Complex < float >TaggedRecv (int from, int tag, + Comm comm); +template Complex < double >TaggedRecv (int from, int tag, + Comm comm); + +template < typename T > T Recv (int from, Comm comm) +{ + return TaggedRecv < T > (from, mpi::ANY_TAG, comm); +} -template -T Recv( int from, Comm comm ) -{ return TaggedRecv( from, mpi::ANY_TAG, comm ); } +template byte Recv (int from, Comm comm); +template int Recv (int from, Comm comm); +template unsigned Recv (int from, Comm comm); +template long int Recv (int from, Comm comm); +template unsigned long Recv (int from, Comm comm); -template byte Recv( int from, Comm comm ); -template int Recv( int from, Comm comm ); -template unsigned Recv( int from, Comm comm ); -template long int Recv( int from, Comm comm ); -template unsigned long Recv( int from, Comm comm ); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int Recv( int from, Comm comm ); -template unsigned long long Recv( int from, Comm comm ); +template long long int Recv (int from, Comm comm); +template unsigned long long Recv (int from, Comm comm); #endif -template float Recv( int from, Comm comm ); -template double Recv( int from, Comm comm ); -template Complex Recv( int from, Comm comm ); -template Complex Recv( int from, Comm comm ); +template float Recv (int from, Comm comm); +template double Recv (int from, Comm comm); +template Complex < float >Recv (int from, Comm comm); +template Complex < double >Recv (int from, Comm comm); -template +template < typename R > void TaggedIRecv -( R* buf, int count, int from, int tag, Comm comm, Request& request ) +(R * buf, int count, int from, int tag, Comm comm, + Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IRecv")) SafeMpi - ( MPI_Irecv( buf, count, TypeMap(), from, tag, comm.comm, &request ) 
); + (MPI_Irecv + (buf, count, TypeMap < R > (), from, tag, + comm.comm, &request)); } -template +template < typename R > void TaggedIRecv -( Complex* buf, int count, int from, int tag, Comm comm, Request& request ) +(Complex < R > *buf, int count, int from, int tag, + Comm comm, Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IRecv")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Irecv( buf, 2*count, TypeMap(), from, tag, comm.comm, &request ) ); + (MPI_Irecv + (buf, 2 * count, TypeMap < R > (), from, tag, + comm.comm, &request)); #else SafeMpi - ( MPI_Irecv - ( buf, count, TypeMap>(), from, tag, comm.comm, &request ) ); + (MPI_Irecv + (buf, count, TypeMap < Complex < R >> (), from, + tag, comm.comm, &request)); #endif } -template void TaggedIRecv( byte* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( int* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( unsigned* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( long int* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( unsigned long* buf, int count, int from, int tag, Comm comm, Request& request ); +template void TaggedIRecv (byte * buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (int *buf, int count, int from, + int tag, Comm comm, + Request & request); +template void TaggedIRecv (unsigned *buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (long int *buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (unsigned long *buf, int count, + int from, int tag, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void TaggedIRecv( long long int* buf, int count, int from, int tag, Comm comm, Request& request ); -template void 
TaggedIRecv( unsigned long long* buf, int count, int from, int tag, Comm comm, Request& request ); +template void TaggedIRecv (long long int *buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (unsigned long long *buf, + int count, int from, int tag, + Comm comm, Request & request); #endif -template void TaggedIRecv( float* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( double* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( Complex* buf, int count, int from, int tag, Comm comm, Request& request ); -template void TaggedIRecv( Complex* buf, int count, int from, int tag, Comm comm, Request& request ); - -template -void IRecv( T* buf, int count, int from, Comm comm, Request& request ) -{ TaggedIRecv( buf, count, from, mpi::ANY_TAG, comm, request ); } +template void TaggedIRecv (float *buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (double *buf, int count, + int from, int tag, Comm comm, + Request & request); +template void TaggedIRecv (Complex < float >*buf, + int count, int from, int tag, + Comm comm, Request & request); +template void TaggedIRecv (Complex < double >*buf, + int count, int from, int tag, + Comm comm, Request & request); + +template < typename T > +void IRecv (T * buf, int count, int from, Comm comm, + Request & request) +{ + TaggedIRecv (buf, count, from, mpi::ANY_TAG, comm, + request); +} -template void IRecv( byte* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( int* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( unsigned* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( long int* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( unsigned long* buf, int count, int from, Comm comm, Request& request ); +template void IRecv (byte * buf, int count, 
int from, + Comm comm, Request & request); +template void IRecv (int *buf, int count, int from, + Comm comm, Request & request); +template void IRecv (unsigned *buf, int count, int from, + Comm comm, Request & request); +template void IRecv (long int *buf, int count, int from, + Comm comm, Request & request); +template void IRecv (unsigned long *buf, int count, + int from, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void IRecv( long long int* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( unsigned long long* buf, int count, int from, Comm comm, Request& request ); +template void IRecv (long long int *buf, int count, + int from, Comm comm, + Request & request); +template void IRecv (unsigned long long *buf, int count, + int from, Comm comm, + Request & request); #endif -template void IRecv( float* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( double* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( Complex* buf, int count, int from, Comm comm, Request& request ); -template void IRecv( Complex* buf, int count, int from, Comm comm, Request& request ); +template void IRecv (float *buf, int count, int from, + Comm comm, Request & request); +template void IRecv (double *buf, int count, int from, + Comm comm, Request & request); +template void IRecv (Complex < float >*buf, int count, + int from, Comm comm, + Request & request); +template void IRecv (Complex < double >*buf, int count, + int from, Comm comm, + Request & request); + +template < typename T > +T TaggedIRecv (int from, int tag, Comm comm, + Request & request) +{ + T b; -template -T TaggedIRecv( int from, int tag, Comm comm, Request& request ) -{ T b; TaggedIRecv( &b, 1, from, tag, comm, request ); return b; } + TaggedIRecv (&b, 1, from, tag, comm, request); + return b; +} -template byte TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template int TaggedIRecv( int from, int 
tag, Comm comm, Request& request ); -template unsigned TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template long int TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template unsigned long TaggedIRecv( int from, int tag, Comm comm, Request& request ); +template byte TaggedIRecv (int from, int tag, Comm comm, + Request & request); +template int TaggedIRecv (int from, int tag, Comm comm, + Request & request); +template unsigned TaggedIRecv (int from, int tag, + Comm comm, + Request & request); +template long int TaggedIRecv (int from, int tag, + Comm comm, + Request & request); +template unsigned long TaggedIRecv (int from, int tag, + Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template unsigned long long TaggedIRecv( int from, int tag, Comm comm, Request& request ); +template long long int TaggedIRecv (int from, int tag, + Comm comm, + Request & request); +template unsigned long long TaggedIRecv (int from, + int tag, + Comm comm, + Request & + request); #endif -template float TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template double TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template Complex TaggedIRecv( int from, int tag, Comm comm, Request& request ); -template Complex TaggedIRecv( int from, int tag, Comm comm, Request& request ); - -template -T IRecv( int from, Comm comm, Request& request ) -{ return TaggedIRecv( from, mpi::ANY_TAG, comm, request ); } +template float TaggedIRecv (int from, int tag, Comm comm, + Request & request); +template double TaggedIRecv (int from, int tag, Comm comm, + Request & request); +template Complex < float >TaggedIRecv (int from, int tag, + Comm comm, + Request & request); +template Complex < double >TaggedIRecv (int from, int tag, + Comm comm, + Request & + request); + +template < typename T > +T IRecv (int from, Comm comm, Request & request) +{ + 
return TaggedIRecv < T > (from, mpi::ANY_TAG, comm, + request); +} -template byte IRecv( int from, Comm comm, Request& request ); -template int IRecv( int from, Comm comm, Request& request ); -template unsigned IRecv( int from, Comm comm, Request& request ); -template long int IRecv( int from, Comm comm, Request& request ); -template unsigned long IRecv( int from, Comm comm, Request& request ); +template byte IRecv (int from, Comm comm, + Request & request); +template int IRecv (int from, Comm comm, + Request & request); +template unsigned IRecv (int from, Comm comm, + Request & request); +template long int IRecv (int from, Comm comm, + Request & request); +template unsigned long IRecv (int from, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int IRecv( int from, Comm comm, Request& request ); -template unsigned long long IRecv( int from, Comm comm, Request& request ); +template long long int IRecv (int from, Comm comm, + Request & request); +template unsigned long long IRecv (int from, Comm comm, + Request & request); #endif -template float IRecv( int from, Comm comm, Request& request ); -template double IRecv( int from, Comm comm, Request& request ); -template Complex IRecv( int from, Comm comm, Request& request ); -template Complex IRecv( int from, Comm comm, Request& request ); - -template +template float IRecv (int from, Comm comm, + Request & request); +template double IRecv (int from, Comm comm, + Request & request); +template Complex < float >IRecv (int from, Comm comm, + Request & request); +template Complex < double >IRecv (int from, Comm comm, + Request & request); + +template < typename R > void TaggedSendRecv -( const R* sbuf, int sc, int to, int stag, - R* rbuf, int rc, int from, int rtag, Comm comm ) +(const R * sbuf, int sc, int to, int stag, + R * rbuf, int rc, int from, int rtag, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status; - 
SafeMpi - ( MPI_Sendrecv - ( const_cast(sbuf), sc, TypeMap(), to, stag, - rbuf, rc, TypeMap(), from, rtag, - comm.comm, &status ) ); + + SafeMpi (MPI_Sendrecv + (const_cast < R * >(sbuf), sc, + TypeMap < R > (), to, stag, rbuf, rc, + TypeMap < R > (), from, rtag, comm.comm, + &status)); } -template +template < typename R > void TaggedSendRecv -( const Complex* sbuf, int sc, int to, int stag, - Complex* rbuf, int rc, int from, int rtag, Comm comm ) +(const Complex < R > *sbuf, int sc, int to, int stag, + Complex < R > *rbuf, int rc, int from, int rtag, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status; + #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Sendrecv - ( const_cast*>(sbuf), 2*sc, TypeMap(), to, stag, - rbuf, 2*rc, TypeMap(), from, rtag, - comm.comm, &status ) ); + (MPI_Sendrecv + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), to, stag, rbuf, 2 * rc, + TypeMap < R > (), from, rtag, comm.comm, + &status)); #else SafeMpi - ( MPI_Sendrecv - ( const_cast*>(sbuf), - sc, TypeMap>(), to, stag, - rbuf, - rc, TypeMap>(), from, rtag, comm.comm, &status ) ); + (MPI_Sendrecv + (const_cast < Complex < R > *>(sbuf), + sc, TypeMap < Complex < R >> (), to, stag, + rbuf, + rc, TypeMap < Complex < R >> (), from, rtag, + comm.comm, &status)); #endif } template void TaggedSendRecv -( const byte* sbuf, int sc, int to, int stag, - byte* rbuf, int rc, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( const int* sbuf, int sc, int to, int stag, - int* rbuf, int rc, int from, int rtag, Comm comm ); +(const byte * sbuf, int sc, int to, int stag, + byte * rbuf, int rc, int from, int rtag, Comm comm); template void TaggedSendRecv -( const unsigned* sbuf, int sc, int to, int stag, - unsigned* rbuf, int rc, int from, int rtag, Comm comm ); +(const int *sbuf, int sc, int to, int stag, + int *rbuf, int rc, int from, int rtag, Comm comm); template void TaggedSendRecv -( const long 
int* sbuf, int sc, int to, int stag, - long int* rbuf, int rc, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( const unsigned long* sbuf, int sc, int to, int stag, - unsigned long* rbuf, int rc, int from, int rtag, Comm comm ); +(const unsigned *sbuf, int sc, int to, int stag, + unsigned *rbuf, int rc, int from, int rtag, + Comm comm); +template void TaggedSendRecv (const long int *sbuf, + int sc, int to, int stag, + long int *rbuf, int rc, + int from, int rtag, + Comm comm); +template void TaggedSendRecv (const unsigned long *sbuf, + int sc, int to, int stag, + unsigned long *rbuf, int rc, + int from, int rtag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void TaggedSendRecv -( const long long int* sbuf, int sc, int to, int stag, - long long int* rbuf, int rc, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( const unsigned long long* sbuf, int sc, int to, int stag, - unsigned long long* rbuf, int rc, int from, int rtag, Comm comm ); +(const long long int *sbuf, int sc, int to, int stag, + long long int *rbuf, int rc, int from, int rtag, + Comm comm); +template void TaggedSendRecv (const unsigned long long + *sbuf, int sc, int to, + int stag, + unsigned long long *rbuf, + int rc, int from, int rtag, + Comm comm); #endif template void TaggedSendRecv -( const float* sbuf, int sc, int to, int stag, - float* rbuf, int rc, int from, int rtag, Comm comm ); +(const float *sbuf, int sc, int to, int stag, + float *rbuf, int rc, int from, int rtag, Comm comm); template void TaggedSendRecv -( const double* sbuf, int sc, int to, int stag, - double* rbuf, int rc, int from, int rtag, Comm comm ); +(const double *sbuf, int sc, int to, int stag, + double *rbuf, int rc, int from, int rtag, Comm comm); template void TaggedSendRecv -( const Complex* sbuf, int sc, int to, int stag, - Complex* rbuf, int rc, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( const Complex* sbuf, int sc, int to, int stag, - Complex* rbuf, int rc, 
int from, int rtag, Comm comm ); - -template +(const Complex < float >*sbuf, int sc, int to, + int stag, Complex < float >*rbuf, int rc, int from, + int rtag, Comm comm); +template void TaggedSendRecv (const Complex < + double >*sbuf, int sc, + int to, int stag, + Complex < double >*rbuf, + int rc, int from, int rtag, + Comm comm); + +template < typename T > void SendRecv -( const T* sbuf, int sc, int to, - T* rbuf, int rc, int from, Comm comm ) -{ TaggedSendRecv( sbuf, sc, to, 0, rbuf, rc, from, mpi::ANY_TAG, comm ); } +(const T * sbuf, int sc, int to, + T * rbuf, int rc, int from, Comm comm) +{ + TaggedSendRecv (sbuf, sc, to, 0, rbuf, rc, from, + mpi::ANY_TAG, comm); +} template void SendRecv -( const byte* sbuf, int sc, int to, - byte* rbuf, int rc, int from, Comm comm ); +(const byte * sbuf, int sc, int to, + byte * rbuf, int rc, int from, Comm comm); template void SendRecv -( const int* sbuf, int sc, int to, - int* rbuf, int rc, int from, Comm comm ); +(const int *sbuf, int sc, int to, + int *rbuf, int rc, int from, Comm comm); template void SendRecv -( const unsigned* sbuf, int sc, int to, - unsigned* rbuf, int rc, int from, Comm comm ); +(const unsigned *sbuf, int sc, int to, + unsigned *rbuf, int rc, int from, Comm comm); template void SendRecv -( const long int* sbuf, int sc, int to, - long int* rbuf, int rc, int from, Comm comm ); +(const long int *sbuf, int sc, int to, + long int *rbuf, int rc, int from, Comm comm); template void SendRecv -( const unsigned long* sbuf, int sc, int to, - unsigned long* rbuf, int rc, int from, Comm comm ); +(const unsigned long *sbuf, int sc, int to, + unsigned long *rbuf, int rc, int from, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void SendRecv -( const long long int* sbuf, int sc, int to, - long long int* rbuf, int rc, int from, Comm comm ); +(const long long int *sbuf, int sc, int to, + long long int *rbuf, int rc, int from, Comm comm); template void SendRecv -( const unsigned long long* sbuf, int sc, int to, - 
unsigned long long* rbuf, int rc, int from, Comm comm ); +(const unsigned long long *sbuf, int sc, int to, + unsigned long long *rbuf, int rc, int from, + Comm comm); #endif template void SendRecv -( const float* sbuf, int sc, int to, - float* rbuf, int rc, int from, Comm comm ); +(const float *sbuf, int sc, int to, + float *rbuf, int rc, int from, Comm comm); template void SendRecv -( const double* sbuf, int sc, int to, - double* rbuf, int rc, int from, Comm comm ); +(const double *sbuf, int sc, int to, + double *rbuf, int rc, int from, Comm comm); template void SendRecv -( const Complex* sbuf, int sc, int to, - Complex* rbuf, int rc, int from, Comm comm ); +(const Complex < float >*sbuf, int sc, int to, + Complex < float >*rbuf, int rc, int from, Comm comm); template void SendRecv -( const Complex* sbuf, int sc, int to, - Complex* rbuf, int rc, int from, Comm comm ); +(const Complex < double >*sbuf, int sc, int to, + Complex < double >*rbuf, int rc, int from, + Comm comm); -template -T TaggedSendRecv( T sb, int to, int stag, int from, int rtag, Comm comm ) -{ - T rb; - TaggedSendRecv( &sb, 1, to, stag, &rb, 1, from, rtag, comm ); - return rb; +template < typename T > +T TaggedSendRecv (T sb, int to, int stag, int from, + int rtag, Comm comm) +{ + T rb; + + TaggedSendRecv (&sb, 1, to, stag, &rb, 1, from, rtag, + comm); + return rb; } template byte TaggedSendRecv -( byte sb, int to, int stag, int from, int rtag, Comm comm ); -template int TaggedSendRecv -( int sb, int to, int stag, int from, int rtag, Comm comm ); -template unsigned TaggedSendRecv -( unsigned sb, int to, int stag, int from, int rtag, Comm comm ); -template long int TaggedSendRecv -( long int sb, int to, int stag, int from, int rtag, Comm comm ); -template unsigned long TaggedSendRecv -( unsigned long sb, int to, int stag, int from, int rtag, Comm comm ); +(byte sb, int to, int stag, int from, int rtag, + Comm comm); +template int TaggedSendRecv (int sb, int to, int stag, + int from, int rtag, + 
Comm comm); +template unsigned TaggedSendRecv (unsigned sb, int to, + int stag, int from, + int rtag, Comm comm); +template long int TaggedSendRecv (long int sb, int to, + int stag, int from, + int rtag, Comm comm); +template unsigned long TaggedSendRecv (unsigned long sb, + int to, int stag, + int from, int rtag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template long long int TaggedSendRecv -( long long int sb, int to, int stag, int from, int rtag, Comm comm ); -template unsigned long long TaggedSendRecv -( unsigned long long sb, int to, int stag, int from, int rtag, Comm comm ); +(long long int sb, int to, int stag, int from, + int rtag, Comm comm); +template unsigned long long TaggedSendRecv (unsigned long + long sb, + int to, + int stag, + int from, + int rtag, + Comm comm); #endif template float TaggedSendRecv -( float sb, int to, int stag, int from, int rtag, Comm comm ); -template double TaggedSendRecv -( double sb, int to, int stag, int from, int rtag, Comm comm ); -template Complex TaggedSendRecv -( Complex sb, int to, int stag, int from, int rtag, Comm comm ); -template Complex TaggedSendRecv -( Complex sb, int to, int stag, int from, int rtag, Comm comm ); - -template -T SendRecv( T sb, int to, int from, Comm comm ) -{ return TaggedSendRecv( sb, to, 0, from, mpi::ANY_TAG, comm ); } +(float sb, int to, int stag, int from, int rtag, + Comm comm); +template double TaggedSendRecv (double sb, int to, + int stag, int from, + int rtag, Comm comm); +template Complex < float >TaggedSendRecv (Complex < + float >sb, + int to, + int stag, + int from, + int rtag, + Comm comm); +template Complex < double >TaggedSendRecv (Complex < + double >sb, + int to, + int stag, + int from, + int rtag, + Comm comm); + +template < typename T > +T SendRecv (T sb, int to, int from, Comm comm) +{ + return TaggedSendRecv (sb, to, 0, from, mpi::ANY_TAG, + comm); +} -template byte SendRecv( byte sb, int to, int from, Comm comm ); -template int SendRecv( int sb, int to, int from, Comm 
comm ); -template unsigned SendRecv( unsigned sb, int to, int from, Comm comm ); -template long int SendRecv( long int sb, int to, int from, Comm comm ); -template unsigned long SendRecv( unsigned long sb, int to, int from, Comm comm ); +template byte SendRecv (byte sb, int to, int from, + Comm comm); +template int SendRecv (int sb, int to, int from, + Comm comm); +template unsigned SendRecv (unsigned sb, int to, int from, + Comm comm); +template long int SendRecv (long int sb, int to, int from, + Comm comm); +template unsigned long SendRecv (unsigned long sb, int to, + int from, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int SendRecv( long long int sb, int to, int from, Comm comm ); -template unsigned long long SendRecv( unsigned long long sb, int to, int from, Comm comm ); +template long long int SendRecv (long long int sb, int to, + int from, Comm comm); +template unsigned long long SendRecv (unsigned long long + sb, int to, + int from, + Comm comm); #endif -template float SendRecv( float sb, int to, int from, Comm comm ); -template double SendRecv( double sb, int to, int from, Comm comm ); -template Complex SendRecv -( Complex sb, int to, int from, Comm comm ); -template Complex SendRecv -( Complex sb, int to, int from, Comm comm ); - -template +template float SendRecv (float sb, int to, int from, + Comm comm); +template double SendRecv (double sb, int to, int from, + Comm comm); +template Complex < float >SendRecv (Complex < float >sb, + int to, int from, + Comm comm); +template Complex < double >SendRecv (Complex < double >sb, + int to, int from, + Comm comm); + +template < typename R > void TaggedSendRecv -( R* buf, int count, int to, int stag, int from, int rtag, Comm comm ) +(R * buf, int count, int to, int stag, int from, + int rtag, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status; - SafeMpi - ( MPI_Sendrecv_replace - ( buf, count, TypeMap(), to, stag, from, 
rtag, comm.comm, &status ) ); + + SafeMpi (MPI_Sendrecv_replace + (buf, count, TypeMap < R > (), to, stag, + from, rtag, comm.comm, &status)); } -template +template < typename R > void TaggedSendRecv -( Complex* buf, int count, int to, int stag, int from, int rtag, Comm comm ) +(Complex < R > *buf, int count, int to, int stag, + int from, int rtag, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::SendRecv")) + DEBUG_ONLY (CallStackEntry cse ("mpi::SendRecv")) Status status; + #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Sendrecv_replace - ( buf, 2*count, TypeMap(), to, stag, from, rtag, comm.comm, - &status ) ); + (MPI_Sendrecv_replace + (buf, 2 * count, TypeMap < R > (), to, stag, + from, rtag, comm.comm, &status)); #else SafeMpi - ( MPI_Sendrecv_replace - ( buf, count, TypeMap>(), - to, stag, from, rtag, comm.comm, &status ) ); + (MPI_Sendrecv_replace + (buf, count, TypeMap < Complex < R >> (), + to, stag, from, rtag, comm.comm, &status)); #endif } template void TaggedSendRecv -( byte* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( int* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( unsigned* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( long int* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( unsigned long* buf, int count, int to, int stag, int from, int rtag, Comm comm ); +(byte * buf, int count, int to, int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (int *buf, int count, int to, + int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (unsigned *buf, int count, + int to, int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (long int *buf, int count, + int to, int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (unsigned long *buf, + int count, int to, int stag, + 
int from, int rtag, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void TaggedSendRecv -( long long int* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( unsigned long long* buf, int count, int to, int stag, int from, int rtag, Comm comm ); +(long long int *buf, int count, int to, int stag, + int from, int rtag, Comm comm); +template void TaggedSendRecv (unsigned long long *buf, + int count, int to, int stag, + int from, int rtag, + Comm comm); #endif template void TaggedSendRecv -( float* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( double* buf, int count, int to, int stag, int from, int rtag, Comm comm ); -template void TaggedSendRecv -( Complex* buf, int count, int to, int stag, - int from, int rtag, Comm comm ); -template void TaggedSendRecv -( Complex* buf, int count, int to, int stag, - int from, int rtag, Comm comm ); - -template -void SendRecv( T* buf, int count, int to, int from, Comm comm ) -{ TaggedSendRecv( buf, count, to, 0, from, mpi::ANY_TAG, comm ); } +(float *buf, int count, int to, int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (double *buf, int count, + int to, int stag, int from, + int rtag, Comm comm); +template void TaggedSendRecv (Complex < float >*buf, + int count, int to, int stag, + int from, int rtag, + Comm comm); +template void TaggedSendRecv (Complex < double >*buf, + int count, int to, int stag, + int from, int rtag, + Comm comm); + +template < typename T > +void SendRecv (T * buf, int count, int to, int from, + Comm comm) +{ + TaggedSendRecv (buf, count, to, 0, from, mpi::ANY_TAG, + comm); +} template void SendRecv -( byte* buf, int count, int to, int from, Comm comm ); -template void SendRecv -( int* buf, int count, int to, int from, Comm comm ); -template void SendRecv -( unsigned* buf, int count, int to, int from, Comm comm ); +(byte * buf, int count, int to, int from, Comm comm); template void 
SendRecv -( long int* buf, int count, int to, int from, Comm comm ); +(int *buf, int count, int to, int from, Comm comm); template void SendRecv -( unsigned long* buf, int count, int to, int from, Comm comm ); +(unsigned *buf, int count, int to, int from, + Comm comm); +template void SendRecv (long int *buf, int count, int to, + int from, Comm comm); +template void SendRecv (unsigned long *buf, int count, + int to, int from, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void SendRecv -( long long int* buf, int count, int to, int from, Comm comm ); -template void SendRecv -( unsigned long long* buf, int count, int to, int from, Comm comm ); +(long long int *buf, int count, int to, int from, + Comm comm); +template void SendRecv (unsigned long long *buf, + int count, int to, int from, + Comm comm); #endif template void SendRecv -( float* buf, int count, int to, int from, Comm comm ); -template void SendRecv -( double* buf, int count, int to, int from, Comm comm ); +(float *buf, int count, int to, int from, Comm comm); template void SendRecv -( Complex* buf, int count, int to, int from, Comm comm ); +(double *buf, int count, int to, int from, Comm comm); template void SendRecv -( Complex* buf, int count, int to, int from, Comm comm ); - -template -void Broadcast( R* buf, int count, int root, Comm comm ) +(Complex < float >*buf, int count, int to, int from, + Comm comm); +template void SendRecv (Complex < double >*buf, int count, + int to, int from, Comm comm); + +template < typename R > +void Broadcast (R * buf, int count, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Broadcast")) - SafeMpi( MPI_Bcast( buf, count, TypeMap(), root, comm.comm ) ); + DEBUG_ONLY (CallStackEntry cse ("mpi::Broadcast")) + SafeMpi (MPI_Bcast + (buf, count, TypeMap < R > (), root, + comm.comm)); } -template -void Broadcast( Complex* buf, int count, int root, Comm comm ) +template < typename R > +void Broadcast (Complex < R > *buf, int count, + int root, Comm comm) { - 
DEBUG_ONLY(CallStackEntry cse("mpi::Broadcast")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Broadcast")) #ifdef EL_AVOID_COMPLEX_MPI - SafeMpi( MPI_Bcast( buf, 2*count, TypeMap(), root, comm.comm ) ); + SafeMpi (MPI_Bcast + (buf, 2 * count, TypeMap < R > (), root, + comm.comm)); #else - SafeMpi( MPI_Bcast( buf, count, TypeMap>(), root, comm.comm ) ); + SafeMpi (MPI_Bcast + (buf, count, TypeMap < Complex < R >> (), + root, comm.comm)); #endif } -template void Broadcast( byte* buf, int count, int root, Comm comm ); -template void Broadcast( int* buf, int count, int root, Comm comm ); -template void Broadcast( unsigned* buf, int count, int root, Comm comm ); -template void Broadcast( long int* buf, int count, int root, Comm comm ); -template void Broadcast( unsigned long* buf, int count, int root, Comm comm ); +template void Broadcast (byte * buf, int count, int root, + Comm comm); +template void Broadcast (int *buf, int count, int root, + Comm comm); +template void Broadcast (unsigned *buf, int count, + int root, Comm comm); +template void Broadcast (long int *buf, int count, + int root, Comm comm); +template void Broadcast (unsigned long *buf, int count, + int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Broadcast( long long int* buf, int count, int root, Comm comm ); -template void Broadcast( unsigned long long* buf, int count, int root, Comm comm ); +template void Broadcast (long long int *buf, int count, + int root, Comm comm); +template void Broadcast (unsigned long long *buf, + int count, int root, Comm comm); #endif -template void Broadcast( float* buf, int count, int root, Comm comm ); -template void Broadcast( double* buf, int count, int root, Comm comm ); -template void Broadcast( Complex* buf, int count, int root, Comm comm ); -template void Broadcast( Complex* buf, int count, int root, Comm comm ); - -template -void Broadcast( T& b, int root, Comm comm ) -{ Broadcast( &b, 1, root, comm ); } +template void Broadcast (float *buf, int count, 
int root, + Comm comm); +template void Broadcast (double *buf, int count, int root, + Comm comm); +template void Broadcast (Complex < float >*buf, int count, + int root, Comm comm); +template void Broadcast (Complex < double >*buf, + int count, int root, Comm comm); + +template < typename T > void Broadcast (T & b, int root, + Comm comm) +{ + Broadcast (&b, 1, root, comm); +} -template void Broadcast( byte& b, int root, Comm comm ); -template void Broadcast( int& b, int root, Comm comm ); -template void Broadcast( unsigned& b, int root, Comm comm ); -template void Broadcast( long int& b, int root, Comm comm ); -template void Broadcast( unsigned long& b, int root, Comm comm ); +template void Broadcast (byte & b, int root, Comm comm); +template void Broadcast (int &b, int root, Comm comm); +template void Broadcast (unsigned &b, int root, + Comm comm); +template void Broadcast (long int &b, int root, + Comm comm); +template void Broadcast (unsigned long &b, int root, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Broadcast( long long int& b, int root, Comm comm ); -template void Broadcast( unsigned long long& b, int root, Comm comm ); +template void Broadcast (long long int &b, int root, + Comm comm); +template void Broadcast (unsigned long long &b, int root, + Comm comm); #endif -template void Broadcast( float& b, int root, Comm comm ); -template void Broadcast( double& b, int root, Comm comm ); -template void Broadcast( Complex& b, int root, Comm comm ); -template void Broadcast( Complex& b, int root, Comm comm ); +template void Broadcast (float &b, int root, Comm comm); +template void Broadcast (double &b, int root, Comm comm); +template void Broadcast (Complex < float >&b, int root, + Comm comm); +template void Broadcast (Complex < double >&b, int root, + Comm comm); #ifdef EL_HAVE_NONBLOCKING_COLLECTIVES -template -void IBroadcast( R* buf, int count, int root, Comm comm, Request& request ) +template < typename R > +void IBroadcast (R * buf, int count, 
int root, + Comm comm, Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IBroadcast")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IBroadcast")) SafeMpi - ( MPI_Ibcast( buf, count, TypeMap(), root, comm.comm, &request ) ); + (MPI_Ibcast + (buf, count, TypeMap < R > (), root, comm.comm, + &request)); } -template +template < typename R > void IBroadcast -( Complex* buf, int count, int root, Comm comm, Request& request ) +(Complex < R > *buf, int count, int root, Comm comm, + Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IBroadcast")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IBroadcast")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Ibcast( buf, 2*count, TypeMap(), root, comm.comm, &request ) ); + (MPI_Ibcast + (buf, 2 * count, TypeMap < R > (), root, + comm.comm, &request)); #else SafeMpi - ( MPI_Ibcast - ( buf, count, TypeMap>(), root, comm.comm, &request ) ); + (MPI_Ibcast + (buf, count, TypeMap < Complex < R >> (), root, + comm.comm, &request)); #endif } -template void IBroadcast( byte* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( int* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( long int* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned long* buf, int count, int root, Comm comm, Request& request ); +template void IBroadcast (byte * buf, int count, int root, + Comm comm, Request & request); +template void IBroadcast (int *buf, int count, int root, + Comm comm, Request & request); +template void IBroadcast (unsigned *buf, int count, + int root, Comm comm, + Request & request); +template void IBroadcast (long int *buf, int count, + int root, Comm comm, + Request & request); +template void IBroadcast (unsigned long *buf, int count, + int root, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG -template void IBroadcast( long 
long int* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned long long* buf, int count, int root, Comm comm, Request& request ); +template void IBroadcast (long long int *buf, int count, + int root, Comm comm, + Request & request); +template void IBroadcast (unsigned long long *buf, + int count, int root, Comm comm, + Request & request); #endif -template void IBroadcast( float* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( double* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( Complex* buf, int count, int root, Comm comm, Request& request ); -template void IBroadcast( Complex* buf, int count, int root, Comm comm, Request& request ); +template void IBroadcast (float *buf, int count, int root, + Comm comm, Request & request); +template void IBroadcast (double *buf, int count, + int root, Comm comm, + Request & request); +template void IBroadcast (Complex < float >*buf, + int count, int root, Comm comm, + Request & request); +template void IBroadcast (Complex < double >*buf, + int count, int root, Comm comm, + Request & request); + +template < typename T > +void IBroadcast (T & b, int root, Comm comm, + Request & request) +{ + IBroadcast (&b, 1, root, comm, request); +} -template -void IBroadcast( T& b, int root, Comm comm, Request& request ) -{ IBroadcast( &b, 1, root, comm, request ); } - -template void IBroadcast( byte& b, int root, Comm comm, Request& request ); -template void IBroadcast( int& b, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned& b, int root, Comm comm, Request& request ); -template void IBroadcast( long int& b, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned long& b, int root, Comm comm, Request& request ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void IBroadcast( long long int& b, int root, Comm comm, Request& request ); -template void IBroadcast( unsigned long long& b, int 
root, Comm comm, Request& request ); -#endif -template void IBroadcast( float& b, int root, Comm comm, Request& request ); -template void IBroadcast( double& b, int root, Comm comm, Request& request ); -template void IBroadcast( Complex& b, int root, Comm comm, Request& request ); -template void IBroadcast( Complex& b, int root, Comm comm, Request& request ); +template void IBroadcast (byte & b, int root, Comm comm, + Request & request); +template void IBroadcast (int &b, int root, Comm comm, + Request & request); +template void IBroadcast (unsigned &b, int root, + Comm comm, Request & request); +template void IBroadcast (long int &b, int root, + Comm comm, Request & request); +template void IBroadcast (unsigned long &b, int root, + Comm comm, Request & request); +#ifdef EL_HAVE_MPI_LONG_LONG +template void IBroadcast (long long int &b, int root, + Comm comm, Request & request); +template void IBroadcast (unsigned long long &b, int root, + Comm comm, Request & request); +#endif +template void IBroadcast (float &b, int root, Comm comm, + Request & request); +template void IBroadcast (double &b, int root, Comm comm, + Request & request); +template void IBroadcast (Complex < float >&b, int root, + Comm comm, Request & request); +template void IBroadcast (Complex < double >&b, int root, + Comm comm, Request & request); #endif // ifdef EL_HAVE_NONBLOCKING_COLLECTIVES -template +template < typename R > void Gather -( const R* sbuf, int sc, - R* rbuf, int rc, int root, Comm comm ) +(const R * sbuf, int sc, R * rbuf, int rc, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) SafeMpi - ( MPI_Gather - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), root, comm.comm ) ); + (MPI_Gather + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), root, comm.comm)); } -template +template < typename R > void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, 
Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Gather - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), root, comm.comm ) ); + (MPI_Gather + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), root, comm.comm)); #else SafeMpi - ( MPI_Gather - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), - root, comm.comm ) ); + (MPI_Gather + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), root, comm.comm)); #endif } -template void Gather( const byte* sbuf, int sc, byte* rbuf, int rc, int root, Comm comm ); -template void Gather( const int* sbuf, int sc, int* rbuf, int rc, int root, Comm comm ); -template void Gather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, int root, Comm comm ); -template void Gather( const long int* sbuf, int sc, long int* rbuf, int rc, int root, Comm comm ); -template void Gather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, int root, Comm comm ); +template void Gather (const byte * sbuf, int sc, + byte * rbuf, int rc, int root, + Comm comm); +template void Gather (const int *sbuf, int sc, int *rbuf, + int rc, int root, Comm comm); +template void Gather (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, int root, + Comm comm); +template void Gather (const long int *sbuf, int sc, + long int *rbuf, int rc, int root, + Comm comm); +template void Gather (const unsigned long *sbuf, int sc, + unsigned long *rbuf, int rc, + int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Gather( const long long int* sbuf, int sc, long long int* rbuf, int rc, int root, Comm comm ); -template void Gather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, int root, Comm 
comm ); +template void Gather (const long long int *sbuf, int sc, + long long int *rbuf, int rc, + int root, Comm comm); +template void Gather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + int rc, int root, Comm comm); #endif -template void Gather( const float* sbuf, int sc, float* rbuf, int rc, int root, Comm comm ); -template void Gather( const double* sbuf, int sc, double* rbuf, int rc, int root, Comm comm ); -template void Gather( const Complex* sbuf, int sc, Complex* rbuf, int rc, int root, Comm comm ); -template void Gather( const Complex* sbuf, int sc, Complex* rbuf, int rc, int root, Comm comm ); +template void Gather (const float *sbuf, int sc, + float *rbuf, int rc, int root, + Comm comm); +template void Gather (const double *sbuf, int sc, + double *rbuf, int rc, int root, + Comm comm); +template void Gather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, int root, Comm comm); +template void Gather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, int root, Comm comm); #ifdef EL_HAVE_NONBLOCKING_COLLECTIVES -template +template < typename R > void IGather -( const R* sbuf, int sc, - R* rbuf, int rc, int root, Comm comm, Request& request ) +(const R * sbuf, int sc, + R * rbuf, int rc, int root, Comm comm, + Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::IGather")) SafeMpi - ( MPI_Igather - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), root, comm.comm, &request ) ); + (MPI_Igather + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), root, comm.comm, + &request)); } -template +template < typename R > void IGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm, Request& request ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, int root, Comm comm, + Request & request) { - DEBUG_ONLY(CallStackEntry cse("mpi::IGather")) + DEBUG_ONLY 
(CallStackEntry cse ("mpi::IGather")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Igather - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), - root, comm.comm, &request ) ); + (MPI_Igather + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), root, comm.comm, &request)); #else SafeMpi - ( MPI_Igather - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), - root, comm.comm, &request ) ); + (MPI_Igather + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), root, comm.comm, + &request)); #endif } template void IGather -( const byte* sbuf, int sc, - byte* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const int* sbuf, int sc, - int* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const unsigned* sbuf, int sc, - unsigned* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const long int* sbuf, int sc, - long int* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const unsigned long* sbuf, int sc, - unsigned long* rbuf, int rc, int root, Comm comm, Request& request ); +(const byte * sbuf, int sc, + byte * rbuf, int rc, int root, Comm comm, + Request & request); +template void IGather (const int *sbuf, int sc, int *rbuf, + int rc, int root, Comm comm, + Request & request); +template void IGather (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, int root, + Comm comm, Request & request); +template void IGather (const long int *sbuf, int sc, + long int *rbuf, int rc, int root, + Comm comm, Request & request); +template void IGather (const unsigned long *sbuf, int sc, + unsigned long *rbuf, int rc, + int root, Comm comm, + Request & request); #ifdef EL_HAVE_MPI_LONG_LONG template void IGather -( const long long int* sbuf, int sc, - long long int* rbuf, int rc, int root, Comm comm, Request& request ); -template 
void IGather -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, int rc, int root, Comm comm, Request& request ); +(const long long int *sbuf, int sc, + long long int *rbuf, int rc, int root, Comm comm, + Request & request); +template void IGather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + int rc, int root, Comm comm, + Request & request); #endif template void IGather -( const float* sbuf, int sc, - float* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const double* sbuf, int sc, - double* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm, Request& request ); -template void IGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm, Request& request ); +(const float *sbuf, int sc, + float *rbuf, int rc, int root, Comm comm, + Request & request); +template void IGather (const double *sbuf, int sc, + double *rbuf, int rc, int root, + Comm comm, Request & request); +template void IGather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, int root, Comm comm, + Request & request); +template void IGather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, int root, Comm comm, + Request & request); #endif // ifdef EL_HAVE_NONBLOCKING_COLLECTIVES -template +template < typename R > void Gather -( const R* sbuf, int sc, - R* rbuf, const int* rcs, const int* rds, int root, Comm comm ) +(const R * sbuf, int sc, + R * rbuf, const int *rcs, const int *rds, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) SafeMpi - ( MPI_Gatherv - ( const_cast(sbuf), - sc, - TypeMap(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap(), - root, - comm.comm ) ); + (MPI_Gatherv + (const_cast < R * >(sbuf), + sc, + TypeMap < R > (), + rbuf, + const_cast < int 
*>(rcs), + const_cast < int *>(rds), TypeMap < R > (), + root, comm.comm)); } -template +template < typename R > void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, int root, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, const int *rcs, const int *rds, + int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Gather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Gather")) #ifdef EL_AVOID_COMPLEX_MPI const int commRank = Rank( comm ); const int commSize = Size( comm ); vector rcsDouble, rdsDouble; if( commRank == root ) { - rcsDouble.resize( commSize ); - rdsDouble.resize( commSize ); - for( int i=0; i*>(sbuf), 2*sc, TypeMap(), - rbuf, rcsDouble.data(), rdsDouble.data(), TypeMap(), - root, comm.comm ) ); + (MPI_Gatherv + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, rcsDouble.data (), + rdsDouble.data (), TypeMap < R > (), root, + comm.comm)); #else SafeMpi - ( MPI_Gatherv - ( const_cast*>(sbuf), - sc, - TypeMap>(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap>(), - root, - comm.comm ) ); + (MPI_Gatherv + (const_cast < Complex < R > *>(sbuf), + sc, + TypeMap < Complex < R >> (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), + TypeMap < Complex < R >> (), root, + comm.comm)); #endif } template void Gather -( const byte* sbuf, int sc, - byte* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const int* sbuf, int sc, - int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned* sbuf, int sc, - unsigned* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const long int* sbuf, int sc, - long int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned long* sbuf, int sc, - unsigned long* rbuf, const int* rcs, const int* rds, int root, Comm comm ); +(const byte * sbuf, int sc, + byte * rbuf, 
const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const int *sbuf, int sc, int *rbuf, + const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const unsigned *sbuf, int sc, + unsigned *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const long int *sbuf, int sc, + long int *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const unsigned long *sbuf, int sc, + unsigned long *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void Gather -( const long long int* sbuf, int sc, - long long int* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, const int* rcs, const int* rds, int root, Comm comm ); +(const long long int *sbuf, int sc, + long long int *rbuf, const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + const int *rcs, const int *rds, + int root, Comm comm); #endif template void Gather -( const float* sbuf, int sc, - float* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const double* sbuf, int sc, - double* rbuf, const int* rcs, const int* rds, int root, Comm comm ); -template void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, - int root, Comm comm ); -template void Gather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, - int root, Comm comm ); - -template +(const float *sbuf, int sc, + float *rbuf, const int *rcs, const int *rds, + int root, Comm comm); +template void Gather (const double *sbuf, int sc, + double *rbuf, const int *rcs, + const int *rds, int root, + Comm comm); +template void Gather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + const int *rcs, 
const int *rds, + int root, Comm comm); +template void Gather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + const int *rcs, const int *rds, + int root, Comm comm); + +template < typename R > void AllGather -( const R* sbuf, int sc, - R* rbuf, int rc, Comm comm ) +(const R * sbuf, int sc, R * rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS SafeMpi - ( MPI_Allgather - ( (UCP)const_cast(sbuf), sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, sizeof(R)*rc, MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgather + ((UCP) const_cast < R * >(sbuf), sizeof (R) * sc, + MPI_UNSIGNED_CHAR, (UCP) rbuf, sizeof (R) * rc, + MPI_UNSIGNED_CHAR, comm.comm)); #else SafeMpi - ( MPI_Allgather - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), comm.comm ) ); + (MPI_Allgather + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), comm.comm)); #endif } -template +template < typename R > void AllGather -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS SafeMpi - ( MPI_Allgather - ( (UCP)const_cast*>(sbuf), 2*sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, 2*sizeof(R)*rc, MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgather + ((UCP) const_cast < Complex < R > *>(sbuf), + 2 * sizeof (R) * sc, MPI_UNSIGNED_CHAR, + (UCP) rbuf, 2 * sizeof (R) * rc, + MPI_UNSIGNED_CHAR, comm.comm)); #else - #ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Allgather - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), comm.comm ) ); - #else + (MPI_Allgather + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), comm.comm)); +#else SafeMpi - ( 
MPI_Allgather - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), comm.comm ) ); - #endif + (MPI_Allgather + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), comm.comm)); +#endif #endif } -template void AllGather( const byte* sbuf, int sc, byte* rbuf, int rc, Comm comm ); -template void AllGather( const int* sbuf, int sc, int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned* sbuf, int sc, unsigned* rbuf, int rc, Comm comm ); -template void AllGather( const long int* sbuf, int sc, long int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned long* sbuf, int sc, unsigned long* rbuf, int rc, Comm comm ); +template void AllGather (const byte * sbuf, int sc, + byte * rbuf, int rc, Comm comm); +template void AllGather (const int *sbuf, int sc, + int *rbuf, int rc, Comm comm); +template void AllGather (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, + Comm comm); +template void AllGather (const long int *sbuf, int sc, + long int *rbuf, int rc, + Comm comm); +template void AllGather (const unsigned long *sbuf, + int sc, unsigned long *rbuf, + int rc, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllGather( const long long int* sbuf, int sc, long long int* rbuf, int rc, Comm comm ); -template void AllGather( const unsigned long long* sbuf, int sc, unsigned long long* rbuf, int rc, Comm comm ); +template void AllGather (const long long int *sbuf, + int sc, long long int *rbuf, + int rc, Comm comm); +template void AllGather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + int rc, Comm comm); #endif -template void AllGather( const float* sbuf, int sc, float* rbuf, int rc, Comm comm ); -template void AllGather( const double* sbuf, int sc, double* rbuf, int rc, Comm comm ); -template void AllGather( const Complex* sbuf, int sc, Complex* rbuf, int rc, Comm comm ); -template void AllGather( const Complex* sbuf, int sc, Complex* 
rbuf, int rc, Comm comm ); - -template +template void AllGather (const float *sbuf, int sc, + float *rbuf, int rc, Comm comm); +template void AllGather (const double *sbuf, int sc, + double *rbuf, int rc, Comm comm); +template void AllGather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, Comm comm); +template void AllGather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, Comm comm); + +template < typename R > void AllGather -( const R* sbuf, int sc, - R* rbuf, const int* rcs, const int* rds, Comm comm ) +(const R * sbuf, int sc, + R * rbuf, const int *rcs, const int *rds, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS const int commSize = Size( comm ); vector byteRcs( commSize ), byteRds( commSize ); for( int i=0; i(sbuf), sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, byteRcs.data(), byteRds.data(), MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgatherv + ((UCP) const_cast < R * >(sbuf), sizeof (R) * sc, + MPI_UNSIGNED_CHAR, (UCP) rbuf, byteRcs.data (), + byteRds.data (), MPI_UNSIGNED_CHAR, comm.comm)); #else SafeMpi - ( MPI_Allgatherv - ( const_cast(sbuf), - sc, - TypeMap(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap(), - comm.comm ) ); + (MPI_Allgatherv + (const_cast < R * >(sbuf), + sc, + TypeMap < R > (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), TypeMap < R > (), + comm.comm)); #endif } -template +template < typename R > void AllGather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, const int *rcs, const int *rds, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllGather")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllGather")) #ifdef EL_USE_BYTE_ALLGATHERS const int commSize = Size( comm ); vector byteRcs( commSize ), byteRds( commSize ); for( int i=0; i*>(sbuf), 
2*sizeof(R)*sc, MPI_UNSIGNED_CHAR, - (UCP)rbuf, byteRcs.data(), byteRds.data(), MPI_UNSIGNED_CHAR, - comm.comm ) ); + (MPI_Allgatherv + ((UCP) const_cast < Complex < R > *>(sbuf), + 2 * sizeof (R) * sc, MPI_UNSIGNED_CHAR, + (UCP) rbuf, byteRcs.data (), byteRds.data (), + MPI_UNSIGNED_CHAR, comm.comm)); #else #ifdef EL_AVOID_COMPLEX_MPI const int commSize = Size( comm ); vector realRcs( commSize ), realRds( commSize ); for( int i=0; i*>(sbuf), 2*sc, TypeMap(), - rbuf, realRcs.data(), realRds.data(), TypeMap(), comm.comm ) ); - #else + (MPI_Allgatherv + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, realRcs.data (), + realRds.data (), TypeMap < R > (), comm.comm)); +#else SafeMpi - ( MPI_Allgatherv - ( const_cast*>(sbuf), - sc, - TypeMap>(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap>(), - comm.comm ) ); - #endif + (MPI_Allgatherv + (const_cast < Complex < R > *>(sbuf), + sc, + TypeMap < Complex < R >> (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), + TypeMap < Complex < R >> (), comm.comm)); +#endif #endif } template void AllGather -( const byte* sbuf, int sc, - byte* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const int* sbuf, int sc, - int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const unsigned* sbuf, int sc, - unsigned* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const long int* sbuf, int sc, - long int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const unsigned long* sbuf, int sc, - unsigned long* rbuf, const int* rcs, const int* rds, Comm comm ); +(const byte * sbuf, int sc, + byte * rbuf, const int *rcs, const int *rds, + Comm comm); +template void AllGather (const int *sbuf, int sc, + int *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllGather (const unsigned *sbuf, int sc, + unsigned *rbuf, const int *rcs, + const int *rds, Comm comm); 
+template void AllGather (const long int *sbuf, int sc, + long int *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllGather (const unsigned long *sbuf, + int sc, unsigned long *rbuf, + const int *rcs, const int *rds, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void AllGather -( const long long int* sbuf, int sc, - long long int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, const int* rcs, const int* rds, Comm comm ); +(const long long int *sbuf, int sc, + long long int *rbuf, const int *rcs, const int *rds, + Comm comm); +template void AllGather (const unsigned long long *sbuf, + int sc, unsigned long long *rbuf, + const int *rcs, const int *rds, + Comm comm); #endif template void AllGather -( const float* sbuf, int sc, - float* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const double* sbuf, int sc, - double* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllGather -( const Complex* sbuf, int sc, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ); - -template +(const float *sbuf, int sc, + float *rbuf, const int *rcs, const int *rds, + Comm comm); +template void AllGather (const double *sbuf, int sc, + double *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllGather (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + const int *rcs, const int *rds, + Comm comm); +template void AllGather (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + const int *rcs, const int *rds, + Comm comm); + +template < typename R > void Scatter -( const R* sbuf, int sc, - R* rbuf, int rc, int root, Comm comm ) +(const R * sbuf, int sc, R * rbuf, int rc, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Scatter")) + 
DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter")) SafeMpi - ( MPI_Scatter - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), root, comm.comm)); } -template +template < typename R > void Scatter -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Scatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Scatter - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), root, comm.comm)); #else SafeMpi - ( MPI_Scatter - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), - root, comm.comm ) ); + (MPI_Scatter + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), root, comm.comm)); #endif } template void Scatter -( const byte* sbuf, int sc, - byte* rbuf, int rc, int root, Comm comm ); +(const byte * sbuf, int sc, + byte * rbuf, int rc, int root, Comm comm); template void Scatter -( const int* sbuf, int sc, - int* rbuf, int rc, int root, Comm comm ); -template void Scatter -( const unsigned* sbuf, int sc, - unsigned* rbuf, int rc, int root, Comm comm ); -template void Scatter -( const long int* sbuf, int sc, - long int* rbuf, int rc, int root, Comm comm ); -template void Scatter -( const unsigned long* sbuf, int sc, - unsigned long* rbuf, int rc, int root, Comm comm ); +(const int *sbuf, int sc, int *rbuf, int rc, int root, + Comm comm); +template void Scatter (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, int root, + Comm comm); +template void Scatter (const long int *sbuf, int sc, + long int *rbuf, int rc, int root, + Comm 
comm); +template void Scatter (const unsigned long *sbuf, int sc, + unsigned long *rbuf, int rc, + int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void Scatter -( const long long int* sbuf, int sc, - long long int* rbuf, int rc, int root, Comm comm ); +(const long long int *sbuf, int sc, + long long int *rbuf, int rc, int root, Comm comm); template void Scatter -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, int rc, int root, Comm comm ); +(const unsigned long long *sbuf, int sc, + unsigned long long *rbuf, int rc, int root, + Comm comm); #endif template void Scatter -( const float* sbuf, int sc, - float* rbuf, int rc, int root, Comm comm ); +(const float *sbuf, int sc, + float *rbuf, int rc, int root, Comm comm); template void Scatter -( const double* sbuf, int sc, - double* rbuf, int rc, int root, Comm comm ); +(const double *sbuf, int sc, + double *rbuf, int rc, int root, Comm comm); template void Scatter -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm ); +(const Complex < float >*sbuf, int sc, + Complex < float >*rbuf, int rc, int root, Comm comm); template void Scatter -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, int root, Comm comm ); +(const Complex < double >*sbuf, int sc, + Complex < double >*rbuf, int rc, int root, + Comm comm); -template -void Scatter( R* buf, int sc, int rc, int root, Comm comm ) +template < typename R > +void Scatter (R * buf, int sc, int rc, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Scatter")) - const int commRank = Rank( comm ); - if( commRank == root ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter")) + const int commRank = Rank (comm); + + if (commRank == root) { #ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Scatter - ( buf, sc, TypeMap(), - MPI_IN_PLACE, rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (buf, sc, TypeMap < R > (), + MPI_IN_PLACE, rc, TypeMap < R > (), root, + comm.comm)); #else const int commSize = Size( comm ); 
vector sendBuf( sc*commSize ); MemCopy( sendBuf.data(), buf, sc*commSize ); SafeMpi - ( MPI_Scatter - ( sendBuf.data(), sc, TypeMap(), - buf, rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (sendBuf.data (), sc, TypeMap < R > (), + buf, rc, TypeMap < R > (), root, + comm.comm)); #endif } else { SafeMpi - ( MPI_Scatter - ( 0, sc, TypeMap(), - buf, rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (0, sc, TypeMap < R > (), + buf, rc, TypeMap < R > (), root, + comm.comm)); } } -template -void Scatter( Complex* buf, int sc, int rc, int root, Comm comm ) +template < typename R > +void Scatter (Complex < R > *buf, int sc, int rc, + int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Scatter")) - const int commRank = Rank( comm ); - if( commRank == root ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Scatter")) + const int commRank = Rank (comm); + + if (commRank == root) { #ifdef EL_AVOID_COMPLEX_MPI -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Scatter ( buf, 2*sc, TypeMap(), @@ -1763,12 +3580,13 @@ void Scatter( Complex* buf, int sc, int rc, int root, Comm comm ) vector> sendBuf( sc*commSize ); MemCopy( sendBuf.data(), buf, sc*commSize ); SafeMpi - ( MPI_Scatter - ( sendBuf.data(), 2*sc, TypeMap(), - buf, 2*rc, TypeMap(), root, comm.comm ) ); -# endif + (MPI_Scatter + (sendBuf.data (), 2 * sc, TypeMap < R > (), + buf, 2 * rc, TypeMap < R > (), root, + comm.comm)); +#endif #else -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Scatter ( buf, sc, TypeMap>(), @@ -1778,134 +3596,148 @@ void Scatter( Complex* buf, int sc, int rc, int root, Comm comm ) vector> sendBuf( sc*commSize ); MemCopy( sendBuf.data(), buf, sc*commSize ); SafeMpi - ( MPI_Scatter - ( sendBuf.data(), sc, TypeMap>(), - buf, rc, TypeMap>(), root, comm.comm ) ); -# endif + (MPI_Scatter + (sendBuf.data (), sc, + TypeMap < Complex < R >> (), buf, rc, + TypeMap < Complex < R >> (), root, + comm.comm)); +#endif #endif } else { #ifdef 
EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Scatter - ( 0, 2*sc, TypeMap(), - buf, 2*rc, TypeMap(), root, comm.comm ) ); + (MPI_Scatter + (0, 2 * sc, TypeMap < R > (), + buf, 2 * rc, TypeMap < R > (), root, + comm.comm)); #else SafeMpi - ( MPI_Scatter - ( 0, sc, TypeMap>(), - buf, rc, TypeMap>(), root, comm.comm ) ); + (MPI_Scatter + (0, sc, TypeMap < Complex < R >> (), + buf, rc, TypeMap < Complex < R >> (), + root, comm.comm)); #endif } } -template void Scatter( byte* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( int* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( unsigned* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( long int* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( unsigned long* buf, int sc, int rc, int root, Comm comm ); +template void Scatter (byte * buf, int sc, int rc, + int root, Comm comm); +template void Scatter (int *buf, int sc, int rc, int root, + Comm comm); +template void Scatter (unsigned *buf, int sc, int rc, + int root, Comm comm); +template void Scatter (long int *buf, int sc, int rc, + int root, Comm comm); +template void Scatter (unsigned long *buf, int sc, int rc, + int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Scatter( long long int* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( unsigned long long* buf, int sc, int rc, int root, Comm comm ); +template void Scatter (long long int *buf, int sc, int rc, + int root, Comm comm); +template void Scatter (unsigned long long *buf, int sc, + int rc, int root, Comm comm); #endif -template void Scatter( float* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( double* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( Complex* buf, int sc, int rc, int root, Comm comm ); -template void Scatter( Complex* buf, int sc, int rc, int root, Comm comm ); - -template +template void Scatter (float *buf, int sc, int rc, + int root, Comm comm); +template 
void Scatter (double *buf, int sc, int rc, + int root, Comm comm); +template void Scatter (Complex < float >*buf, int sc, + int rc, int root, Comm comm); +template void Scatter (Complex < double >*buf, int sc, + int rc, int root, Comm comm); + +template < typename R > void AllToAll -( const R* sbuf, int sc, - R* rbuf, int rc, Comm comm ) +(const R * sbuf, int sc, R * rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll")) SafeMpi - ( MPI_Alltoall - ( const_cast(sbuf), sc, TypeMap(), - rbuf, rc, TypeMap(), comm.comm ) ); + (MPI_Alltoall + (const_cast < R * >(sbuf), sc, TypeMap < R > (), + rbuf, rc, TypeMap < R > (), comm.comm)); } -template +template < typename R > void AllToAll -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, Comm comm ) +(const Complex < R > *sbuf, int sc, + Complex < R > *rbuf, int rc, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll")) #ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Alltoall - ( const_cast*>(sbuf), 2*sc, TypeMap(), - rbuf, 2*rc, TypeMap(), comm.comm ) ); + (MPI_Alltoall + (const_cast < Complex < R > *>(sbuf), 2 * sc, + TypeMap < R > (), rbuf, 2 * rc, + TypeMap < R > (), comm.comm)); #else SafeMpi - ( MPI_Alltoall - ( const_cast*>(sbuf), sc, TypeMap>(), - rbuf, rc, TypeMap>(), comm.comm ) ); + (MPI_Alltoall + (const_cast < Complex < R > *>(sbuf), sc, + TypeMap < Complex < R >> (), rbuf, rc, + TypeMap < Complex < R >> (), comm.comm)); #endif } template void AllToAll -( const byte* sbuf, int sc, - byte* rbuf, int rc, Comm comm ); -template void AllToAll -( const int* sbuf, int sc, - int* rbuf, int rc, Comm comm ); -template void AllToAll -( const unsigned* sbuf, int sc, - unsigned* rbuf, int rc, Comm comm ); -template void AllToAll -( const long int* sbuf, int sc, - long int* rbuf, int rc, Comm comm ); -template void AllToAll -( const unsigned long* sbuf, int sc, - unsigned long* rbuf, int rc, 
Comm comm ); +(const byte * sbuf, int sc, byte * rbuf, int rc, + Comm comm); +template void AllToAll (const int *sbuf, int sc, + int *rbuf, int rc, Comm comm); +template void AllToAll (const unsigned *sbuf, int sc, + unsigned *rbuf, int rc, + Comm comm); +template void AllToAll (const long int *sbuf, int sc, + long int *rbuf, int rc, + Comm comm); +template void AllToAll (const unsigned long *sbuf, int sc, + unsigned long *rbuf, int rc, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void AllToAll -( const long long int* sbuf, int sc, - long long int* rbuf, int rc, Comm comm ); +(const long long int *sbuf, int sc, + long long int *rbuf, int rc, Comm comm); template void AllToAll -( const unsigned long long* sbuf, int sc, - unsigned long long* rbuf, int rc, Comm comm ); +(const unsigned long long *sbuf, int sc, + unsigned long long *rbuf, int rc, Comm comm); #endif template void AllToAll -( const float* sbuf, int sc, - float* rbuf, int rc, Comm comm ); -template void AllToAll -( const double* sbuf, int sc, - double* rbuf, int rc, Comm comm ); -template void AllToAll -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, Comm comm ); -template void AllToAll -( const Complex* sbuf, int sc, - Complex* rbuf, int rc, Comm comm ); - -template +(const float *sbuf, int sc, float *rbuf, int rc, + Comm comm); +template void AllToAll (const double *sbuf, int sc, + double *rbuf, int rc, Comm comm); +template void AllToAll (const Complex < float >*sbuf, + int sc, Complex < float >*rbuf, + int rc, Comm comm); +template void AllToAll (const Complex < double >*sbuf, + int sc, Complex < double >*rbuf, + int rc, Comm comm); + +template < typename R > void AllToAll -( const R* sbuf, const int* scs, const int* sds, - R* rbuf, const int* rcs, const int* rds, Comm comm ) +(const R * sbuf, const int *scs, const int *sds, + R * rbuf, const int *rcs, const int *rds, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll")) 
SafeMpi - ( MPI_Alltoallv - ( const_cast(sbuf), - const_cast(scs), - const_cast(sds), - TypeMap(), - rbuf, - const_cast(rcs), - const_cast(rds), - TypeMap(), - comm.comm ) ); + (MPI_Alltoallv + (const_cast < R * >(sbuf), + const_cast < int *>(scs), + const_cast < int *>(sds), + TypeMap < R > (), + rbuf, + const_cast < int *>(rcs), + const_cast < int *>(rds), TypeMap < R > (), + comm.comm)); } -template +template < typename R > void AllToAll -( const Complex* sbuf, const int* scs, const int* sds, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ) +(const Complex < R > *sbuf, const int *scs, + const int *sds, Complex < R > *rbuf, const int *rcs, + const int *rds, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllToAll")) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllToAll")) #ifdef EL_AVOID_COMPLEX_MPI int p; MPI_Comm_size( comm.comm, &p ); @@ -1942,228 +3774,373 @@ void AllToAll } template void AllToAll -( const byte* sbuf, const int* scs, const int* sds, - byte* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const int* sbuf, const int* scs, const int* sds, - int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const unsigned* sbuf, const int* scs, const int* sds, - unsigned* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const long int* sbuf, const int* scs, const int* sds, - long int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const unsigned long* sbuf, const int* scs, const int* sds, - unsigned long* rbuf, const int* rcs, const int* rds, Comm comm ); +(const byte * sbuf, const int *scs, const int *sds, + byte * rbuf, const int *rcs, const int *rds, + Comm comm); +template void AllToAll (const int *sbuf, const int *scs, + const int *sds, int *rbuf, + const int *rcs, const int *rds, + Comm comm); +template void AllToAll (const unsigned *sbuf, + const int *scs, const int *sds, + unsigned *rbuf, const int *rcs, + const int *rds, 
Comm comm); +template void AllToAll (const long int *sbuf, + const int *scs, const int *sds, + long int *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllToAll (const unsigned long *sbuf, + const int *scs, const int *sds, + unsigned long *rbuf, + const int *rcs, const int *rds, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG template void AllToAll -( const long long int* sbuf, const int* scs, const int* sds, - long long int* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const unsigned long long* sbuf, const int* scs, const int* sds, - unsigned long long* rbuf, const int* rcs, const int* rds, Comm comm ); +(const long long int *sbuf, const int *scs, + const int *sds, long long int *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllToAll (const unsigned long long *sbuf, + const int *scs, const int *sds, + unsigned long long *rbuf, + const int *rcs, const int *rds, + Comm comm); #endif template void AllToAll -( const float* sbuf, const int* scs, const int* sds, - float* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const double* sbuf, const int* scs, const int* sds, - double* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const Complex* sbuf, const int* scs, const int* sds, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ); -template void AllToAll -( const Complex* sbuf, const int* scs, const int* sds, - Complex* rbuf, const int* rcs, const int* rds, Comm comm ); - -template +(const float *sbuf, const int *scs, const int *sds, + float *rbuf, const int *rcs, const int *rds, + Comm comm); +template void AllToAll (const double *sbuf, + const int *scs, const int *sds, + double *rbuf, const int *rcs, + const int *rds, Comm comm); +template void AllToAll (const Complex < float >*sbuf, + const int *scs, const int *sds, + Complex < float >*rbuf, + const int *rcs, const int *rds, + Comm comm); +template void AllToAll (const Complex < 
double >*sbuf, + const int *scs, const int *sds, + Complex < double >*rbuf, + const int *rcs, const int *rds, + Comm comm); + +template < typename T > void Reduce -( const T* sbuf, T* rbuf, int count, Op op, int root, Comm comm ) +(const T * sbuf, T * rbuf, int count, Op op, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Reduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce")) + if (count != 0) { - SafeMpi - ( MPI_Reduce - ( const_cast(sbuf), rbuf, count, TypeMap(), - op.op, root, comm.comm ) ); + SafeMpi (MPI_Reduce + (const_cast < T * >(sbuf), rbuf, count, + TypeMap < T > (), op.op, root, + comm.comm)); } } -template +template < typename R > void Reduce -( const Complex* sbuf, - Complex* rbuf, int count, Op op, int root, Comm comm ) +(const Complex < R > *sbuf, + Complex < R > *rbuf, int count, Op op, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Reduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce")) + if (count != 0) { #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { SafeMpi - ( MPI_Reduce - ( const_cast*>(sbuf), - rbuf, 2*count, TypeMap(), op.op, root, comm.comm ) ); + (MPI_Reduce + (const_cast < Complex < R > *>(sbuf), + rbuf, 2 * count, TypeMap < R > (), + op.op, root, comm.comm)); } else { SafeMpi - ( MPI_Reduce - ( const_cast*>(sbuf), - rbuf, count, TypeMap>(), op.op, root, comm.comm ) ); + (MPI_Reduce + (const_cast < Complex < R > *>(sbuf), + rbuf, count, + TypeMap < Complex < R >> (), op.op, + root, comm.comm)); } #else SafeMpi - ( MPI_Reduce - ( const_cast*>(sbuf), - rbuf, count, TypeMap>(), op.op, root, comm.comm ) ); + (MPI_Reduce + (const_cast < Complex < R > *>(sbuf), + rbuf, count, TypeMap < Complex < R >> (), + op.op, root, comm.comm)); #endif } } -template void Reduce( const byte* sbuf, byte* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const int* sbuf, int* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( 
const unsigned* sbuf, unsigned* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const long int* sbuf, long int* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Op op, int root, Comm comm ); +template void Reduce (const byte * sbuf, byte * rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const int *sbuf, int *rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const unsigned *sbuf, + unsigned *rbuf, int count, Op op, + int root, Comm comm); +template void Reduce (const long int *sbuf, + long int *rbuf, int count, Op op, + int root, Comm comm); +template void Reduce (const unsigned long *sbuf, + unsigned long *rbuf, int count, + Op op, int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Reduce( const long long int* sbuf, long long int* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, Op op, int root, Comm comm ); +template void Reduce (const long long int *sbuf, + long long int *rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const unsigned long long *sbuf, + unsigned long long *rbuf, int count, + Op op, int root, Comm comm); #endif -template void Reduce( const float* sbuf, float* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const double* sbuf, double* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const Complex* sbuf, Complex* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const Complex* sbuf, Complex* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, 
int root, Comm comm ); -template void Reduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, int root, Comm comm ); -template void Reduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, int root, Comm comm ); - -template -void Reduce( const T* sbuf, T* rbuf, int count, int root, Comm comm ) -{ Reduce( sbuf, rbuf, count, mpi::SUM, root, comm ); } - -template void Reduce( const byte* sbuf, byte* rbuf, int count, int root, Comm comm ); -template void Reduce( const int* sbuf, int* rbuf, int count, int root, Comm comm ); -template void Reduce( const unsigned* sbuf, unsigned* rbuf, int count, int root, Comm comm ); -template void Reduce( const long int* sbuf, long int* rbuf, int count, int root, Comm comm ); -template void Reduce( const unsigned long* sbuf, unsigned long* rbuf, int count, int root, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void Reduce( const long long int* sbuf, long long int* rbuf, int count, int root, Comm comm ); -template void Reduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, int root, Comm comm ); -#endif -template void Reduce( const float* sbuf, float* rbuf, int count, int root, Comm comm ); -template void Reduce( const double* sbuf, double* rbuf, int count, int root, Comm comm ); -template void Reduce( const Complex* sbuf, Complex* rbuf, int count, int root, Comm comm ); -template void Reduce( const Complex* sbuf, Complex* rbuf, int count, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, int root, Comm comm ); -template void Reduce( const ValueInt* sbuf, ValueInt* rbuf, int count, int root, Comm comm ); -template void Reduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, int root, Comm comm ); -template void Reduce( 
const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, int root, Comm comm ); -template void Reduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, int root, Comm comm ); +template void Reduce (const float *sbuf, float *rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const double *sbuf, double *rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const Complex < float >*sbuf, + Complex < float >*rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const Complex < double >*sbuf, + Complex < double >*rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const ValueInt < Int > *sbuf, + ValueInt < Int > *rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const ValueInt < float >*sbuf, + ValueInt < float >*rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const ValueInt < double >*sbuf, + ValueInt < double >*rbuf, int count, + Op op, int root, Comm comm); +template void Reduce (const ValueIntPair < Int > *sbuf, + ValueIntPair < Int > *rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const ValueIntPair < float >*sbuf, + ValueIntPair < float >*rbuf, + int count, Op op, int root, + Comm comm); +template void Reduce (const ValueIntPair < double >*sbuf, + ValueIntPair < double >*rbuf, + int count, Op op, int root, + Comm comm); + +template < typename T > +void Reduce (const T * sbuf, T * rbuf, int count, + int root, Comm comm) +{ + Reduce (sbuf, rbuf, count, mpi::SUM, root, comm); +} -template -T Reduce( T sb, Op op, int root, Comm comm ) -{ +template void Reduce (const byte * sbuf, byte * rbuf, + int count, int root, Comm comm); +template void Reduce (const int *sbuf, int *rbuf, + int count, int root, Comm comm); +template void Reduce (const unsigned *sbuf, + unsigned *rbuf, int count, int root, + Comm comm); +template void Reduce (const long int *sbuf, + long int *rbuf, int count, int root, + Comm comm); +template void 
Reduce (const unsigned long *sbuf, + unsigned long *rbuf, int count, + int root, Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Reduce (const long long int *sbuf, + long long int *rbuf, int count, + int root, Comm comm); +template void Reduce (const unsigned long long *sbuf, + unsigned long long *rbuf, int count, + int root, Comm comm); +#endif +template void Reduce (const float *sbuf, float *rbuf, + int count, int root, Comm comm); +template void Reduce (const double *sbuf, double *rbuf, + int count, int root, Comm comm); +template void Reduce (const Complex < float >*sbuf, + Complex < float >*rbuf, int count, + int root, Comm comm); +template void Reduce (const Complex < double >*sbuf, + Complex < double >*rbuf, int count, + int root, Comm comm); +template void Reduce (const ValueInt < Int > *sbuf, + ValueInt < Int > *rbuf, int count, + int root, Comm comm); +template void Reduce (const ValueInt < float >*sbuf, + ValueInt < float >*rbuf, int count, + int root, Comm comm); +template void Reduce (const ValueInt < double >*sbuf, + ValueInt < double >*rbuf, int count, + int root, Comm comm); +template void Reduce (const ValueIntPair < Int > *sbuf, + ValueIntPair < Int > *rbuf, + int count, int root, Comm comm); +template void Reduce (const ValueIntPair < float >*sbuf, + ValueIntPair < float >*rbuf, + int count, int root, Comm comm); +template void Reduce (const ValueIntPair < double >*sbuf, + ValueIntPair < double >*rbuf, + int count, int root, Comm comm); + +template < typename T > T Reduce (T sb, Op op, int root, + Comm comm) +{ T rb; - Reduce( &sb, &rb, 1, op, root, comm ); + + Reduce (&sb, &rb, 1, op, root, comm); return rb; } -template byte Reduce( byte sb, Op op, int root, Comm comm ); -template int Reduce( int sb, Op op, int root, Comm comm ); -template unsigned Reduce( unsigned sb, Op op, int root, Comm comm ); -template long int Reduce( long int sb, Op op, int root, Comm comm ); -template unsigned long Reduce( unsigned long sb, Op op, int root, 
Comm comm ); +template byte Reduce (byte sb, Op op, int root, + Comm comm); +template int Reduce (int sb, Op op, int root, Comm comm); +template unsigned Reduce (unsigned sb, Op op, int root, + Comm comm); +template long int Reduce (long int sb, Op op, int root, + Comm comm); +template unsigned long Reduce (unsigned long sb, Op op, + int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int Reduce( long long int sb, Op op, int root, Comm comm ); -template unsigned long long Reduce( unsigned long long sb, Op op, int root, Comm comm ); +template long long int Reduce (long long int sb, Op op, + int root, Comm comm); +template unsigned long long Reduce (unsigned long long sb, + Op op, int root, + Comm comm); #endif -template float Reduce( float sb, Op op, int root, Comm comm ); -template double Reduce( double sb, Op op, int root, Comm comm ); -template Complex Reduce( Complex sb, Op op, int root, Comm comm ); -template Complex Reduce( Complex sb, Op op, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, Op op, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, Op op, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, Op op, int root, Comm comm ); -template ValueIntPair Reduce( ValueIntPair sb, Op op, int root, Comm comm ); -template ValueIntPair Reduce( ValueIntPair sb, Op op, int root, Comm comm ); -template ValueIntPair Reduce( ValueIntPair sb, Op op, int root, Comm comm ); - -template -T Reduce( T sb, int root, Comm comm ) -{ +template float Reduce (float sb, Op op, int root, + Comm comm); +template double Reduce (double sb, Op op, int root, + Comm comm); +template Complex < float >Reduce (Complex < float >sb, + Op op, int root, + Comm comm); +template Complex < double >Reduce (Complex < double >sb, + Op op, int root, + Comm comm); +template ValueInt < Int > Reduce (ValueInt < Int > sb, + Op op, int root, + Comm comm); +template ValueInt < float >Reduce (ValueInt < float >sb, + Op op, int root, + Comm comm); 
+template ValueInt < double >Reduce (ValueInt < double >sb, + Op op, int root, + Comm comm); +template ValueIntPair < Int > Reduce (ValueIntPair < Int > + sb, Op op, int root, + Comm comm); +template ValueIntPair < float >Reduce (ValueIntPair < + float >sb, Op op, + int root, + Comm comm); +template ValueIntPair < double >Reduce (ValueIntPair < + double >sb, Op op, + int root, + Comm comm); + +template < typename T > T Reduce (T sb, int root, + Comm comm) +{ T rb; - Reduce( &sb, &rb, 1, mpi::SUM, root, comm ); + + Reduce (&sb, &rb, 1, mpi::SUM, root, comm); return rb; } -template byte Reduce( byte sb, int root, Comm comm ); -template int Reduce( int sb, int root, Comm comm ); -template unsigned Reduce( unsigned sb, int root, Comm comm ); -template long int Reduce( long int sb, int root, Comm comm ); -template unsigned long Reduce( unsigned long sb, int root, Comm comm ); +template byte Reduce (byte sb, int root, Comm comm); +template int Reduce (int sb, int root, Comm comm); +template unsigned Reduce (unsigned sb, int root, + Comm comm); +template long int Reduce (long int sb, int root, + Comm comm); +template unsigned long Reduce (unsigned long sb, int root, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int Reduce( long long int sb, int root, Comm comm ); -template unsigned long long Reduce( unsigned long long sb, int root, Comm comm ); +template long long int Reduce (long long int sb, int root, + Comm comm); +template unsigned long long Reduce (unsigned long long sb, + int root, Comm comm); #endif -template float Reduce( float sb, int root, Comm comm ); -template double Reduce( double sb, int root, Comm comm ); -template Complex Reduce( Complex sb, int root, Comm comm ); -template Complex Reduce( Complex sb, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, int root, Comm comm ); -template ValueInt Reduce( ValueInt sb, int root, Comm comm ); -template ValueIntPair Reduce( 
ValueIntPair sb, int root, Comm comm ); -template ValueIntPair Reduce( ValueIntPair sb, int root, Comm comm ); -template ValueIntPair Reduce( ValueIntPair sb, int root, Comm comm ); - -template -void Reduce( T* buf, int count, Op op, int root, Comm comm ) +template float Reduce (float sb, int root, Comm comm); +template double Reduce (double sb, int root, Comm comm); +template Complex < float >Reduce (Complex < float >sb, + int root, Comm comm); +template Complex < double >Reduce (Complex < double >sb, + int root, Comm comm); +template ValueInt < Int > Reduce (ValueInt < Int > sb, + int root, Comm comm); +template ValueInt < float >Reduce (ValueInt < float >sb, + int root, Comm comm); +template ValueInt < double >Reduce (ValueInt < double >sb, + int root, Comm comm); +template ValueIntPair < Int > Reduce (ValueIntPair < Int > + sb, int root, + Comm comm); +template ValueIntPair < float >Reduce (ValueIntPair < + float >sb, + int root, + Comm comm); +template ValueIntPair < double >Reduce (ValueIntPair < + double >sb, + int root, + Comm comm); + +template < typename T > +void Reduce (T * buf, int count, Op op, int root, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Reduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce")) + if (count != 0) { - const int commRank = Rank( comm ); - if( commRank == root ) + const int commRank = Rank (comm); + + if (commRank == root) { #ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce - ( MPI_IN_PLACE, buf, count, TypeMap(), op.op, root, - comm.comm ) ); + (MPI_Reduce + (MPI_IN_PLACE, buf, count, + TypeMap < T > (), op.op, root, + comm.comm)); #else vector sendBuf( count ); MemCopy( sendBuf.data(), buf, count ); SafeMpi - ( MPI_Reduce - ( sendBuf.data(), buf, count, TypeMap(), op.op, root, - comm.comm ) ); + (MPI_Reduce + (sendBuf.data (), buf, count, + TypeMap < T > (), op.op, root, + comm.comm)); #endif } else SafeMpi - ( MPI_Reduce - ( buf, 0, count, TypeMap(), op.op, root, comm.comm ) ); + (MPI_Reduce + 
(buf, 0, count, TypeMap < T > (), + op.op, root, comm.comm)); } } -template -void Reduce( Complex* buf, int count, Op op, int root, Comm comm ) +template < typename R > +void Reduce (Complex < R > *buf, int count, Op op, + int root, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::Reduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::Reduce")) + if (count != 0) { - const int commRank = Rank( comm ); + const int commRank = Rank (comm); + #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { - if( commRank == root ) + if (commRank == root) { # ifdef EL_HAVE_MPI_IN_PLACE SafeMpi @@ -2171,279 +4148,452 @@ void Reduce( Complex* buf, int count, Op op, int root, Comm comm ) ( MPI_IN_PLACE, buf, 2*count, TypeMap(), op.op, root, comm.comm ) ); # else - vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); - SafeMpi - ( MPI_Reduce - ( sendBuf.data(), buf, 2*count, TypeMap(), op.op, - root, comm.comm ) ); -# endif + std::vector < Complex < + R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, + count); + SafeMpi (MPI_Reduce + (sendBuf.data (), buf, + 2 * count, + TypeMap < R > (), op.op, + root, comm.comm)); +#endif } else SafeMpi - ( MPI_Reduce - ( buf, 0, 2*count, TypeMap(), op.op, root, comm.comm ) ); + (MPI_Reduce + (buf, 0, 2 * count, + TypeMap < R > (), op.op, root, + comm.comm)); } else { - if( commRank == root ) + if (commRank == root) { -# ifdef EL_HAVE_MPI_IN_PLACE - SafeMpi - ( MPI_Reduce - ( MPI_IN_PLACE, buf, count, TypeMap>(), op.op, - root, comm.comm ) ); -# else - vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce - ( sendBuf.data(), buf, count, TypeMap>(), op.op, - root, comm.comm ) ); -# endif + (MPI_Reduce + (MPI_IN_PLACE, buf, count, + TypeMap < Complex < R >> (), + op.op, root, comm.comm)); +#else + std::vector < Complex < + R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, + count); + SafeMpi (MPI_Reduce + (sendBuf.data (), buf, + count, + 
TypeMap < Complex < + R >> (), op.op, root, + comm.comm)); +#endif } else SafeMpi - ( MPI_Reduce - ( buf, 0, count, TypeMap>(), op.op, - root, comm.comm ) ); + (MPI_Reduce + (buf, 0, count, + TypeMap < Complex < R >> (), + op.op, root, comm.comm)); } #else - if( commRank == root ) + if (commRank == root) { # ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Reduce ( MPI_IN_PLACE, buf, count, TypeMap>(), op.op, root, comm.comm ) ); -# else - vector> sendBuf( count ); - MemCopy( sendBuf.data(), buf, count ); - SafeMpi - ( MPI_Reduce - ( sendBuf.data(), buf, count, TypeMap>(), op.op, - root, comm.comm ) ); -# endif +#else + std::vector < Complex < + R >> sendBuf (count); + MemCopy (sendBuf.data (), buf, count); + SafeMpi + (MPI_Reduce + (sendBuf.data (), buf, count, + TypeMap < Complex < R >> (), op.op, + root, comm.comm)); +#endif } else SafeMpi - ( MPI_Reduce - ( buf, 0, count, TypeMap>(), op.op, root, - comm.comm ) ); + (MPI_Reduce + (buf, 0, count, + TypeMap < Complex < R >> (), op.op, + root, comm.comm)); #endif } } -template void Reduce( byte* buf, int count, Op op, int root, Comm comm ); -template void Reduce( int* buf, int count, Op op, int root, Comm comm ); -template void Reduce( unsigned* buf, int count, Op op, int root, Comm comm ); -template void Reduce( long int* buf, int count, Op op, int root, Comm comm ); -template void Reduce( unsigned long* buf, int count, Op op, int root, Comm comm ); +template void Reduce (byte * buf, int count, Op op, + int root, Comm comm); +template void Reduce (int *buf, int count, Op op, + int root, Comm comm); +template void Reduce (unsigned *buf, int count, Op op, + int root, Comm comm); +template void Reduce (long int *buf, int count, Op op, + int root, Comm comm); +template void Reduce (unsigned long *buf, int count, + Op op, int root, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void Reduce( long long int* buf, int count, Op op, int root, Comm comm ); -template void Reduce( unsigned long long* buf, int count, Op op, int 
root, Comm comm ); +template void Reduce (long long int *buf, int count, + Op op, int root, Comm comm); +template void Reduce (unsigned long long *buf, int count, + Op op, int root, Comm comm); #endif -template void Reduce( float* buf, int count, Op op, int root, Comm comm ); -template void Reduce( double* buf, int count, Op op, int root, Comm comm ); -template void Reduce( Complex* buf, int count, Op op, int root, Comm comm ); -template void Reduce( Complex* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, Op op, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, Op op, int root, Comm comm ); - -template -void Reduce( T* buf, int count, int root, Comm comm ) -{ Reduce( buf, count, mpi::SUM, root, comm ); } - -template void Reduce( byte* buf, int count, int root, Comm comm ); -template void Reduce( int* buf, int count, int root, Comm comm ); -template void Reduce( unsigned* buf, int count, int root, Comm comm ); -template void Reduce( long int* buf, int count, int root, Comm comm ); -template void Reduce( unsigned long* buf, int count, int root, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void Reduce( long long int* buf, int count, int root, Comm comm ); -template void Reduce( unsigned long long* buf, int count, int root, Comm comm ); -#endif -template void Reduce( float* buf, int count, int root, Comm comm ); -template void Reduce( double* buf, int count, int root, Comm comm ); -template void Reduce( Complex* buf, int count, int root, Comm comm ); -template void Reduce( Complex* buf, int count, int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, 
int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, int root, Comm comm ); -template void Reduce( ValueInt* buf, int count, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, int root, Comm comm ); -template void Reduce( ValueIntPair* buf, int count, int root, Comm comm ); +template void Reduce (float *buf, int count, Op op, + int root, Comm comm); +template void Reduce (double *buf, int count, Op op, + int root, Comm comm); +template void Reduce (Complex < float >*buf, int count, + Op op, int root, Comm comm); +template void Reduce (Complex < double >*buf, int count, + Op op, int root, Comm comm); +template void Reduce (ValueInt < Int > *buf, int count, + Op op, int root, Comm comm); +template void Reduce (ValueInt < float >*buf, int count, + Op op, int root, Comm comm); +template void Reduce (ValueInt < double >*buf, int count, + Op op, int root, Comm comm); +template void Reduce (ValueIntPair < Int > *buf, + int count, Op op, int root, + Comm comm); +template void Reduce (ValueIntPair < float >*buf, + int count, Op op, int root, + Comm comm); +template void Reduce (ValueIntPair < double >*buf, + int count, Op op, int root, + Comm comm); + +template < typename T > +void Reduce (T * buf, int count, int root, Comm comm) +{ + Reduce (buf, count, mpi::SUM, root, comm); +} -template -void AllReduce( const T* sbuf, T* rbuf, int count, Op op, Comm comm ) +template void Reduce (byte * buf, int count, int root, + Comm comm); +template void Reduce (int *buf, int count, int root, + Comm comm); +template void Reduce (unsigned *buf, int count, int root, + Comm comm); +template void Reduce (long int *buf, int count, int root, + Comm comm); +template void Reduce (unsigned long *buf, int count, + int root, Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void Reduce (long long int *buf, int count, + int root, Comm comm); +template void Reduce (unsigned long 
long *buf, int count, + int root, Comm comm); +#endif +template void Reduce (float *buf, int count, int root, + Comm comm); +template void Reduce (double *buf, int count, int root, + Comm comm); +template void Reduce (Complex < float >*buf, int count, + int root, Comm comm); +template void Reduce (Complex < double >*buf, int count, + int root, Comm comm); +template void Reduce (ValueInt < Int > *buf, int count, + int root, Comm comm); +template void Reduce (ValueInt < float >*buf, int count, + int root, Comm comm); +template void Reduce (ValueInt < double >*buf, int count, + int root, Comm comm); +template void Reduce (ValueIntPair < Int > *buf, + int count, int root, Comm comm); +template void Reduce (ValueIntPair < float >*buf, + int count, int root, Comm comm); +template void Reduce (ValueIntPair < double >*buf, + int count, int root, Comm comm); + +template < typename T > +void AllReduce (const T * sbuf, T * rbuf, int count, + Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { - SafeMpi - ( MPI_Allreduce - ( const_cast(sbuf), rbuf, count, TypeMap(), op.op, - comm.comm ) ); + SafeMpi (MPI_Allreduce + (const_cast < T * >(sbuf), rbuf, count, + TypeMap < T > (), op.op, comm.comm)); } } -template +template < typename R > void AllReduce -( const Complex* sbuf, Complex* rbuf, int count, Op op, Comm comm ) +(const Complex < R > *sbuf, Complex < R > *rbuf, + int count, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { SafeMpi - ( MPI_Allreduce - ( const_cast*>(sbuf), - rbuf, 2*count, TypeMap(), op.op, comm.comm ) ); + (MPI_Allreduce + (const_cast < Complex < R > *>(sbuf), + rbuf, 2 * count, TypeMap < R > (), + op.op, comm.comm)); } else { SafeMpi - ( MPI_Allreduce - ( 
const_cast*>(sbuf), - rbuf, count, TypeMap>(), op.op, comm.comm ) ); + (MPI_Allreduce + (const_cast < Complex < R > *>(sbuf), + rbuf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); } #else SafeMpi - ( MPI_Allreduce - ( const_cast*>(sbuf), - rbuf, count, TypeMap>(), op.op, comm.comm ) ); + (MPI_Allreduce + (const_cast < Complex < R > *>(sbuf), + rbuf, count, TypeMap < Complex < R >> (), + op.op, comm.comm)); #endif } } -template void AllReduce( const byte* sbuf, byte* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const int* sbuf, int* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const unsigned* sbuf, unsigned* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const long int* sbuf, long int* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Op op, Comm comm ); +template void AllReduce (const byte * sbuf, byte * rbuf, + int count, Op op, Comm comm); +template void AllReduce (const int *sbuf, int *rbuf, + int count, Op op, Comm comm); +template void AllReduce (const unsigned *sbuf, + unsigned *rbuf, int count, Op op, + Comm comm); +template void AllReduce (const long int *sbuf, + long int *rbuf, int count, Op op, + Comm comm); +template void AllReduce (const unsigned long *sbuf, + unsigned long *rbuf, int count, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( const long long int* sbuf, long long int* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, Op op, Comm comm ); +template void AllReduce (const long long int *sbuf, + long long int *rbuf, int count, + Op op, Comm comm); +template void AllReduce (const unsigned long long *sbuf, + unsigned long long *rbuf, + int count, Op op, Comm comm); #endif -template void AllReduce( const float* sbuf, float* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const double* sbuf, 
double* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const Complex* sbuf, Complex* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const Complex* sbuf, Complex* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Op op, Comm comm ); +template void AllReduce (const float *sbuf, float *rbuf, + int count, Op op, Comm comm); +template void AllReduce (const double *sbuf, double *rbuf, + int count, Op op, Comm comm); +template void AllReduce (const Complex < float >*sbuf, + Complex < float >*rbuf, + int count, Op op, Comm comm); +template void AllReduce (const Complex < double >*sbuf, + Complex < double >*rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueInt < Int > *sbuf, + ValueInt < Int > *rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueInt < float >*sbuf, + ValueInt < float >*rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueInt < double >*sbuf, + ValueInt < double >*rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueIntPair < Int > *sbuf, + ValueIntPair < Int > *rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueIntPair < + float >*sbuf, + ValueIntPair < float >*rbuf, + int count, Op op, Comm comm); +template void AllReduce (const ValueIntPair < + double >*sbuf, + ValueIntPair < double >*rbuf, + int count, Op op, Comm comm); + +template < typename T > +void 
AllReduce (const T * sbuf, T * rbuf, int count, + Comm comm) +{ + AllReduce (sbuf, rbuf, count, mpi::SUM, comm); +} -template -void AllReduce( const T* sbuf, T* rbuf, int count, Comm comm ) -{ AllReduce( sbuf, rbuf, count, mpi::SUM, comm ); } - -template void AllReduce( const byte* sbuf, byte* rbuf, int count, Comm comm ); -template void AllReduce( const int* sbuf, int* rbuf, int count, Comm comm ); -template void AllReduce( const unsigned* sbuf, unsigned* rbuf, int count, Comm comm ); -template void AllReduce( const long int* sbuf, long int* rbuf, int count, Comm comm ); -template void AllReduce( const unsigned long* sbuf, unsigned long* rbuf, int count, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( const long long int* sbuf, long long int* rbuf, int count, Comm comm ); -template void AllReduce( const unsigned long long* sbuf, unsigned long long* rbuf, int count, Comm comm ); -#endif -template void AllReduce( const float* sbuf, float* rbuf, int count, Comm comm ); -template void AllReduce( const double* sbuf, double* rbuf, int count, Comm comm ); -template void AllReduce( const Complex* sbuf, Complex* rbuf, int count, Comm comm ); -template void AllReduce( const Complex* sbuf, Complex* rbuf, int count, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Comm comm ); -template void AllReduce( const ValueInt* sbuf, ValueInt* rbuf, int count, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Comm comm ); -template void AllReduce( const ValueIntPair* sbuf, ValueIntPair* rbuf, int count, Comm comm ); +template void AllReduce (const byte * sbuf, byte * rbuf, + int count, Comm comm); +template void AllReduce (const int *sbuf, int *rbuf, + int count, Comm comm); +template void AllReduce 
(const unsigned *sbuf, + unsigned *rbuf, int count, + Comm comm); +template void AllReduce (const long int *sbuf, + long int *rbuf, int count, + Comm comm); +template void AllReduce (const unsigned long *sbuf, + unsigned long *rbuf, int count, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void AllReduce (const long long int *sbuf, + long long int *rbuf, int count, + Comm comm); +template void AllReduce (const unsigned long long *sbuf, + unsigned long long *rbuf, + int count, Comm comm); +#endif +template void AllReduce (const float *sbuf, float *rbuf, + int count, Comm comm); +template void AllReduce (const double *sbuf, double *rbuf, + int count, Comm comm); +template void AllReduce (const Complex < float >*sbuf, + Complex < float >*rbuf, + int count, Comm comm); +template void AllReduce (const Complex < double >*sbuf, + Complex < double >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < Int > *sbuf, + ValueInt < Int > *rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < float >*sbuf, + ValueInt < float >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueInt < double >*sbuf, + ValueInt < double >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < Int > *sbuf, + ValueIntPair < Int > *rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < + float >*sbuf, + ValueIntPair < float >*rbuf, + int count, Comm comm); +template void AllReduce (const ValueIntPair < + double >*sbuf, + ValueIntPair < double >*rbuf, + int count, Comm comm); + +template < typename T > T AllReduce (T sb, Op op, + Comm comm) +{ + T rb; -template -T AllReduce( T sb, Op op, Comm comm ) -{ T rb; AllReduce( &sb, &rb, 1, op, comm ); return rb; } - -template byte AllReduce( byte sb, Op op, Comm comm ); -template int AllReduce( int sb, Op op, Comm comm ); -template unsigned AllReduce( unsigned sb, Op op, Comm comm ); -template long int AllReduce( long int sb, Op op, Comm comm ); 
-template unsigned long AllReduce( unsigned long sb, Op op, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template long long int AllReduce( long long int sb, Op op, Comm comm ); -template unsigned long long AllReduce( unsigned long long sb, Op op, Comm comm ); -#endif -template float AllReduce( float sb, Op op, Comm comm ); -template double AllReduce( double sb, Op op, Comm comm ); -template Complex AllReduce( Complex sb, Op op, Comm comm ); -template Complex AllReduce( Complex sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Op op, Comm comm ); + AllReduce (&sb, &rb, 1, op, comm); + return rb; +} -template -T AllReduce( T sb, Comm comm ) -{ return AllReduce( sb, mpi::SUM, comm ); } - -template byte AllReduce( byte sb, Comm comm ); -template int AllReduce( int sb, Comm comm ); -template unsigned AllReduce( unsigned sb, Comm comm ); -template long int AllReduce( long int sb, Comm comm ); -template unsigned long AllReduce( unsigned long sb, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template long long int AllReduce( long long int sb, Comm comm ); -template unsigned long long AllReduce( unsigned long long sb, Comm comm ); -#endif -template float AllReduce( float sb, Comm comm ); -template double AllReduce( double sb, Comm comm ); -template Complex AllReduce( Complex sb, Comm comm ); -template Complex AllReduce( Complex sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueInt AllReduce( ValueInt sb, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm 
comm ); -template ValueIntPair AllReduce( ValueIntPair sb, Comm comm ); +template byte AllReduce (byte sb, Op op, Comm comm); +template int AllReduce (int sb, Op op, Comm comm); +template unsigned AllReduce (unsigned sb, Op op, + Comm comm); +template long int AllReduce (long int sb, Op op, + Comm comm); +template unsigned long AllReduce (unsigned long sb, Op op, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template long long int AllReduce (long long int sb, Op op, + Comm comm); +template unsigned long long AllReduce (unsigned long long + sb, Op op, + Comm comm); +#endif +template float AllReduce (float sb, Op op, Comm comm); +template double AllReduce (double sb, Op op, Comm comm); +template Complex < float >AllReduce (Complex < float >sb, + Op op, Comm comm); +template Complex < double >AllReduce (Complex < + double >sb, Op op, + Comm comm); +template ValueInt < Int > AllReduce (ValueInt < Int > sb, + Op op, Comm comm); +template ValueInt < float >AllReduce (ValueInt < + float >sb, Op op, + Comm comm); +template ValueInt < double >AllReduce (ValueInt < + double >sb, Op op, + Comm comm); +template ValueIntPair < Int > AllReduce (ValueIntPair < + Int > sb, Op op, + Comm comm); +template ValueIntPair < float >AllReduce (ValueIntPair < + float >sb, + Op op, + Comm comm); +template ValueIntPair < double >AllReduce (ValueIntPair < + double >sb, + Op op, + Comm comm); + +template < typename T > T AllReduce (T sb, Comm comm) +{ + return AllReduce (sb, mpi::SUM, comm); +} -template -void AllReduce( T* buf, int count, Op op, Comm comm ) +template byte AllReduce (byte sb, Comm comm); +template int AllReduce (int sb, Comm comm); +template unsigned AllReduce (unsigned sb, Comm comm); +template long int AllReduce (long int sb, Comm comm); +template unsigned long AllReduce (unsigned long sb, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template long long int AllReduce (long long int sb, + Comm comm); +template unsigned long long AllReduce (unsigned long long + sb, Comm comm); 
+#endif +template float AllReduce (float sb, Comm comm); +template double AllReduce (double sb, Comm comm); +template Complex < float >AllReduce (Complex < float >sb, + Comm comm); +template Complex < double >AllReduce (Complex < + double >sb, + Comm comm); +template ValueInt < Int > AllReduce (ValueInt < Int > sb, + Comm comm); +template ValueInt < float >AllReduce (ValueInt < + float >sb, + Comm comm); +template ValueInt < double >AllReduce (ValueInt < + double >sb, + Comm comm); +template ValueIntPair < Int > AllReduce (ValueIntPair < + Int > sb, + Comm comm); +template ValueIntPair < float >AllReduce (ValueIntPair < + float >sb, + Comm comm); +template ValueIntPair < double >AllReduce (ValueIntPair < + double >sb, + Comm comm); + +template < typename T > +void AllReduce (T * buf, int count, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { #ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Allreduce - ( MPI_IN_PLACE, buf, count, TypeMap(), op.op, comm.comm ) ); + (MPI_Allreduce + (MPI_IN_PLACE, buf, count, + TypeMap < T > (), op.op, comm.comm)); #else vector sendBuf( count ); MemCopy( sendBuf.data(), buf, count ); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap(), op.op, comm.comm ) ); + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < T > (), op.op, comm.comm)); #endif } } -template -void AllReduce( Complex* buf, int count, Op op, Comm comm ) +template < typename R > +void AllReduce (Complex < R > *buf, int count, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::AllReduce")) - if( count != 0 ) + DEBUG_ONLY (CallStackEntry cse ("mpi::AllReduce")) + if (count != 0) { #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Allreduce ( MPI_IN_PLACE, buf, 2*count, TypeMap(), op.op, comm.comm ) ); @@ -2451,14 +4601,15 @@ void AllReduce( 
Complex* buf, int count, Op op, Comm comm ) vector> sendBuf( count ); MemCopy( sendBuf.data(), buf, count ); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, 2*count, TypeMap(), op.op, - comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, 2 * count, + TypeMap < R > (), op.op, + comm.comm)); +#endif } else { -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Allreduce ( MPI_IN_PLACE, buf, count, TypeMap>(), @@ -2467,13 +4618,14 @@ void AllReduce( Complex* buf, int count, Op op, Comm comm ) vector> sendBuf( count ); MemCopy( sendBuf.data(), buf, count ); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap>(), - op.op, comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#endif } #else -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Allreduce ( MPI_IN_PLACE, buf, count, TypeMap>(), op.op, @@ -2482,185 +4634,296 @@ void AllReduce( Complex* buf, int count, Op op, Comm comm ) vector> sendBuf( count ); MemCopy( sendBuf.data(), buf, count ); SafeMpi - ( MPI_Allreduce - ( sendBuf.data(), buf, count, TypeMap>(), op.op, - comm.comm ) ); -# endif + (MPI_Allreduce + (sendBuf.data (), buf, count, + TypeMap < Complex < R >> (), op.op, + comm.comm)); +#endif #endif } } -template void AllReduce( byte* buf, int count, Op op, Comm comm ); -template void AllReduce( int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned* buf, int count, Op op, Comm comm ); -template void AllReduce( long int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned long* buf, int count, Op op, Comm comm ); +template void AllReduce (byte * buf, int count, Op op, + Comm comm); +template void AllReduce (int *buf, int count, Op op, + Comm comm); +template void AllReduce (unsigned *buf, int count, Op op, + Comm comm); +template void AllReduce (long int *buf, int count, Op op, + Comm comm); +template void AllReduce (unsigned 
long *buf, int count, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( long long int* buf, int count, Op op, Comm comm ); -template void AllReduce( unsigned long long* buf, int count, Op op, Comm comm ); +template void AllReduce (long long int *buf, int count, + Op op, Comm comm); +template void AllReduce (unsigned long long *buf, + int count, Op op, Comm comm); #endif -template void AllReduce( float* buf, int count, Op op, Comm comm ); -template void AllReduce( double* buf, int count, Op op, Comm comm ); -template void AllReduce( Complex* buf, int count, Op op, Comm comm ); -template void AllReduce( Complex* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Op op, Comm comm ); - -template -void AllReduce( T* buf, int count, Comm comm ) -{ AllReduce( buf, count, mpi::SUM, comm ); } - -template void AllReduce( byte* buf, int count, Comm comm ); -template void AllReduce( int* buf, int count, Comm comm ); -template void AllReduce( unsigned* buf, int count, Comm comm ); -template void AllReduce( long int* buf, int count, Comm comm ); -template void AllReduce( unsigned long* buf, int count, Comm comm ); -#ifdef EL_HAVE_MPI_LONG_LONG -template void AllReduce( long long int* buf, int count, Comm comm ); -template void AllReduce( unsigned long long* buf, int count, Comm comm ); -#endif -template void AllReduce( float* buf, int count, Comm comm ); -template void AllReduce( double* buf, int count, Comm comm ); -template void AllReduce( Complex* buf, int count, Comm comm ); -template void AllReduce( Complex* buf, int count, Comm comm ); -template 
void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueInt* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); -template void AllReduce( ValueIntPair* buf, int count, Comm comm ); +template void AllReduce (float *buf, int count, Op op, + Comm comm); +template void AllReduce (double *buf, int count, Op op, + Comm comm); +template void AllReduce (Complex < float >*buf, int count, + Op op, Comm comm); +template void AllReduce (Complex < double >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueInt < Int > *buf, int count, + Op op, Comm comm); +template void AllReduce (ValueInt < float >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueInt < double >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < Int > *buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < float >*buf, + int count, Op op, Comm comm); +template void AllReduce (ValueIntPair < double >*buf, + int count, Op op, Comm comm); + +template < typename T > +void AllReduce (T * buf, int count, Comm comm) +{ + AllReduce (buf, count, mpi::SUM, comm); +} -template -void ReduceScatter( R* sbuf, R* rbuf, int rc, Op op, Comm comm ) +template void AllReduce (byte * buf, int count, + Comm comm); +template void AllReduce (int *buf, int count, Comm comm); +template void AllReduce (unsigned *buf, int count, + Comm comm); +template void AllReduce (long int *buf, int count, + Comm comm); +template void AllReduce (unsigned long *buf, int count, + Comm comm); +#ifdef EL_HAVE_MPI_LONG_LONG +template void AllReduce (long long int *buf, int count, + Comm comm); +template void AllReduce (unsigned long long *buf, + int count, Comm comm); +#endif +template void AllReduce (float *buf, int count, + Comm comm); +template void AllReduce 
(double *buf, int count, + Comm comm); +template void AllReduce (Complex < float >*buf, int count, + Comm comm); +template void AllReduce (Complex < double >*buf, + int count, Comm comm); +template void AllReduce (ValueInt < Int > *buf, int count, + Comm comm); +template void AllReduce (ValueInt < float >*buf, + int count, Comm comm); +template void AllReduce (ValueInt < double >*buf, + int count, Comm comm); +template void AllReduce (ValueIntPair < Int > *buf, + int count, Comm comm); +template void AllReduce (ValueIntPair < float >*buf, + int count, Comm comm); +template void AllReduce (ValueIntPair < double >*buf, + int count, Comm comm); + +template < typename R > +void ReduceScatter (R * sbuf, R * rbuf, int rc, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( sbuf, rc*commSize, op, comm ); - MemCopy( rbuf, &sbuf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (sbuf, rc * commSize, op, comm); + MemCopy (rbuf, &sbuf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, rc, TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter_block + (sbuf, rbuf, rc, TypeMap < R > (), op.op, + comm.comm)); #else - const int commSize = Size( comm ); - Reduce( sbuf, rc*commSize, op, 0, comm ); - Scatter( sbuf, rc, rbuf, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (sbuf, rc * commSize, op, 0, comm); + Scatter (sbuf, rc, rbuf, rc, 0, comm); #endif } -template +template < typename R > void ReduceScatter -( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ) +(Complex < R > *sbuf, Complex < R > *rbuf, int rc, + Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse 
("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( sbuf, rc*commSize, op, comm ); - MemCopy( rbuf, &sbuf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (sbuf, rc * commSize, op, comm); + MemCopy (rbuf, &sbuf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_AVOID_COMPLEX_MPI SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# else + (MPI_Reduce_scatter_block + (sbuf, rbuf, 2 * rc, TypeMap < R > (), op.op, + comm.comm)); +#else SafeMpi - ( MPI_Reduce_scatter_block - ( sbuf, rbuf, rc, TypeMap>(), op.op, comm.comm ) ); -# endif + (MPI_Reduce_scatter_block + (sbuf, rbuf, rc, TypeMap < Complex < R >> (), + op.op, comm.comm)); +#endif #else - const int commSize = Size( comm ); - Reduce( sbuf, rc*commSize, op, 0, comm ); - Scatter( sbuf, rc, rbuf, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (sbuf, rc * commSize, op, 0, comm); + Scatter (sbuf, rc, rbuf, rc, 0, comm); #endif } -template void ReduceScatter( byte* sbuf, byte* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( int* sbuf, int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned* sbuf, unsigned* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( long int* sbuf, long int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long* sbuf, unsigned long* rbuf, int rc, Op op, Comm comm ); +template void ReduceScatter (byte * sbuf, byte * rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (int *sbuf, int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned *sbuf, + unsigned *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (long int *sbuf, + long int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned 
long *sbuf, + unsigned long *rbuf, int rc, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* sbuf, long long int* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long long* sbuf, unsigned long long* rbuf, int rc, Op op, Comm comm ); +template void ReduceScatter (long long int *sbuf, + long long int *rbuf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned long long *sbuf, + unsigned long long *rbuf, + int rc, Op op, Comm comm); #endif -template void ReduceScatter( float* sbuf, float* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( double* sbuf, double* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Op op, Comm comm ); - -template -void ReduceScatter( T* sbuf, T* rbuf, int rc, Comm comm ) -{ ReduceScatter( sbuf, rbuf, rc, mpi::SUM, comm ); } +template void ReduceScatter (float *sbuf, float *rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (double *sbuf, double *rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < float >*sbuf, + Complex < float >*rbuf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < double >*sbuf, + Complex < double >*rbuf, + int rc, Op op, Comm comm); + +template < typename T > +void ReduceScatter (T * sbuf, T * rbuf, int rc, + Comm comm) +{ + ReduceScatter (sbuf, rbuf, rc, mpi::SUM, comm); +} -template void ReduceScatter( byte* sbuf, byte* rbuf, int rc, Comm comm ); -template void ReduceScatter( int* sbuf, int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned* sbuf, unsigned* rbuf, int rc, Comm comm ); -template void ReduceScatter( long int* sbuf, long int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned long* sbuf, unsigned long* rbuf, int rc, Comm comm ); +template void ReduceScatter (byte * sbuf, byte * rbuf, + 
int rc, Comm comm); +template void ReduceScatter (int *sbuf, int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned *sbuf, + unsigned *rbuf, int rc, + Comm comm); +template void ReduceScatter (long int *sbuf, + long int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned long *sbuf, + unsigned long *rbuf, int rc, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* sbuf, long long int* rbuf, int rc, Comm comm ); -template void ReduceScatter( unsigned long long* sbuf, unsigned long long* rbuf, int rc, Comm comm ); +template void ReduceScatter (long long int *sbuf, + long long int *rbuf, int rc, + Comm comm); +template void ReduceScatter (unsigned long long *sbuf, + unsigned long long *rbuf, + int rc, Comm comm); #endif -template void ReduceScatter( float* sbuf, float* rbuf, int rc, Comm comm ); -template void ReduceScatter( double* sbuf, double* rbuf, int rc, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Comm comm ); -template void ReduceScatter( Complex* sbuf, Complex* rbuf, int rc, Comm comm ); +template void ReduceScatter (float *sbuf, float *rbuf, + int rc, Comm comm); +template void ReduceScatter (double *sbuf, double *rbuf, + int rc, Comm comm); +template void ReduceScatter (Complex < float >*sbuf, + Complex < float >*rbuf, + int rc, Comm comm); +template void ReduceScatter (Complex < double >*sbuf, + Complex < double >*rbuf, + int rc, Comm comm); + +template < typename T > T ReduceScatter (T sb, Op op, + Comm comm) +{ + T rb; -template -T ReduceScatter( T sb, Op op, Comm comm ) -{ T rb; ReduceScatter( &sb, &rb, 1, op, comm ); return rb; } + ReduceScatter (&sb, &rb, 1, op, comm); + return rb; +} -template byte ReduceScatter( byte sb, Op op, Comm comm ); -template int ReduceScatter( int sb, Op op, Comm comm ); -template unsigned ReduceScatter( unsigned sb, Op op, Comm comm ); -template long int ReduceScatter( long int sb, Op op, Comm comm ); -template unsigned 
long ReduceScatter( unsigned long sb, Op op, Comm comm ); +template byte ReduceScatter (byte sb, Op op, Comm comm); +template int ReduceScatter (int sb, Op op, Comm comm); +template unsigned ReduceScatter (unsigned sb, Op op, + Comm comm); +template long int ReduceScatter (long int sb, Op op, + Comm comm); +template unsigned long ReduceScatter (unsigned long sb, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int ReduceScatter( long long int sb, Op op, Comm comm ); -template unsigned long long ReduceScatter( unsigned long long sb, Op op, Comm comm ); +template long long int ReduceScatter (long long int sb, + Op op, Comm comm); +template unsigned long long ReduceScatter (unsigned long + long sb, Op op, + Comm comm); #endif -template float ReduceScatter( float sb, Op op, Comm comm ); -template double ReduceScatter( double sb, Op op, Comm comm ); -template Complex ReduceScatter( Complex sb, Op op, Comm comm ); -template Complex ReduceScatter( Complex sb, Op op, Comm comm ); - -template -T ReduceScatter( T sb, Comm comm ) -{ return ReduceScatter( sb, mpi::SUM, comm ); } +template float ReduceScatter (float sb, Op op, Comm comm); +template double ReduceScatter (double sb, Op op, + Comm comm); +template Complex < float >ReduceScatter (Complex < + float >sb, Op op, + Comm comm); +template Complex < double >ReduceScatter (Complex < + double >sb, + Op op, + Comm comm); + +template < typename T > T ReduceScatter (T sb, Comm comm) +{ + return ReduceScatter (sb, mpi::SUM, comm); +} -template byte ReduceScatter( byte sb, Comm comm ); -template int ReduceScatter( int sb, Comm comm ); -template unsigned ReduceScatter( unsigned sb, Comm comm ); -template long int ReduceScatter( long int sb, Comm comm ); -template unsigned long ReduceScatter( unsigned long sb, Comm comm ); +template byte ReduceScatter (byte sb, Comm comm); +template int ReduceScatter (int sb, Comm comm); +template unsigned ReduceScatter (unsigned sb, Comm comm); +template long int 
ReduceScatter (long int sb, Comm comm); +template unsigned long ReduceScatter (unsigned long sb, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template long long int ReduceScatter( long long int sb, Comm comm ); -template unsigned long long ReduceScatter( unsigned long long sb, Comm comm ); +template long long int ReduceScatter (long long int sb, + Comm comm); +template unsigned long long ReduceScatter (unsigned long + long sb, + Comm comm); #endif -template float ReduceScatter( float sb, Comm comm ); -template double ReduceScatter( double sb, Comm comm ); -template Complex ReduceScatter( Complex sb, Comm comm ); -template Complex ReduceScatter( Complex sb, Comm comm ); - -template -void ReduceScatter( R* buf, int rc, Op op, Comm comm ) +template float ReduceScatter (float sb, Comm comm); +template double ReduceScatter (double sb, Comm comm); +template Complex < float >ReduceScatter (Complex < + float >sb, + Comm comm); +template Complex < double >ReduceScatter (Complex < + double >sb, + Comm comm); + +template < typename R > +void ReduceScatter (R * buf, int rc, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( buf, rc*commSize, op, comm ); - if( commRank != 0 ) - MemCopy( buf, &buf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (buf, rc * commSize, op, comm); + if (commRank != 0) + MemCopy (buf, &buf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi ( MPI_Reduce_scatter_block ( MPI_IN_PLACE, buf, rc, TypeMap(), op.op, comm.comm ) ); @@ -2669,167 +4932,256 @@ void ReduceScatter( R* buf, int rc, Op op, Comm comm ) vector sendBuf( rc*commSize ); MemCopy( sendBuf.data(), buf, rc*commSize ); SafeMpi - ( 
MPI_Reduce_scatter_block - ( sendBuf.data(), buf, rc, TypeMap(), op.op, comm.comm ) ); -# endif + (MPI_Reduce_scatter_block + (sendBuf.data (), buf, rc, TypeMap < R > (), + op.op, comm.comm)); +#endif #else - const int commSize = Size( comm ); - Reduce( buf, rc*commSize, op, 0, comm ); - Scatter( buf, rc, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (buf, rc * commSize, op, 0, comm); + Scatter (buf, rc, rc, 0, comm); #endif } // TODO: Handle case where op is not summation -template -void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ) +template < typename R > +void ReduceScatter (Complex < R > *buf, int rc, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_REDUCE_SCATTER_BLOCK_VIA_ALLREDUCE - const int commSize = Size( comm ); - const int commRank = Rank( comm ); - AllReduce( buf, rc*commSize, op, comm ); - if( commRank != 0 ) - MemCopy( buf, &buf[commRank*rc], rc ); + const int commSize = Size (comm); + const int commRank = Rank (comm); + + AllReduce (buf, rc * commSize, op, comm); + if (commRank != 0) + MemCopy (buf, &buf[commRank * rc], rc); #elif defined(EL_HAVE_MPI_REDUCE_SCATTER_BLOCK) -# ifdef EL_AVOID_COMPLEX_MPI -# ifdef EL_HAVE_MPI_IN_PLACE +#ifdef EL_AVOID_COMPLEX_MPI +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce_scatter_block - ( MPI_IN_PLACE, buf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# else - const int commSize = Size( comm ); - vector> sendBuf( rc*commSize ); - MemCopy( sendBuf.data(), buf, rc*commSize ); + (MPI_Reduce_scatter_block + (MPI_IN_PLACE, buf, 2 * rc, TypeMap < R > (), + op.op, comm.comm)); +#else + const int commSize = Size (comm); + std::vector < Complex < R >> sendBuf (rc * commSize); + MemCopy (sendBuf.data (), buf, rc * commSize); SafeMpi - ( MPI_Reduce_scatter_block - ( sendBuf.data(), buf, 2*rc, TypeMap(), op.op, comm.comm ) ); -# endif -# else -# ifdef EL_HAVE_MPI_IN_PLACE + (MPI_Reduce_scatter_block + 
(sendBuf.data (), buf, 2 * rc, TypeMap < R > (), + op.op, comm.comm)); +#endif +#else +#ifdef EL_HAVE_MPI_IN_PLACE SafeMpi - ( MPI_Reduce_scatter_block - ( MPI_IN_PLACE, buf, rc, TypeMap>(), op.op, comm.comm ) ); -# else - const int commSize = Size( comm ); - vector> sendBuf( rc*commSize ); - MemCopy( sendBuf.data(), buf, rc*commSize ); + (MPI_Reduce_scatter_block + (MPI_IN_PLACE, buf, rc, + TypeMap < Complex < R >> (), op.op, comm.comm)); +#else + const int commSize = Size (comm); + + std::vector < Complex < R >> sendBuf (rc * commSize); + MemCopy (sendBuf.data (), buf, rc * commSize); SafeMpi - ( MPI_Reduce_scatter_block - ( sendBuf.data(), buf, rc, TypeMap>(), op.op, comm.comm ) ); -# endif -# endif + (MPI_Reduce_scatter_block + (sendBuf.data (), buf, rc, + TypeMap < Complex < R >> (), op.op, comm.comm)); +#endif +#endif #else - const int commSize = Size( comm ); - Reduce( buf, rc*commSize, op, 0, comm ); - Scatter( buf, rc, rc, 0, comm ); + const int commSize = Size (comm); + + Reduce (buf, rc * commSize, op, 0, comm); + Scatter (buf, rc, rc, 0, comm); #endif } -template void ReduceScatter( byte* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( long int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long* buf, int rc, Op op, Comm comm ); +template void ReduceScatter (byte * buf, int rc, Op op, + Comm comm); +template void ReduceScatter (int *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (unsigned *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (long int *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (unsigned long *buf, int rc, + Op op, Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( unsigned long long* buf, int rc, Op op, Comm comm ); 
+template void ReduceScatter (long long int *buf, int rc, + Op op, Comm comm); +template void ReduceScatter (unsigned long long *buf, + int rc, Op op, Comm comm); #endif -template void ReduceScatter( float* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( double* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Op op, Comm comm ); - -template -void ReduceScatter( T* buf, int rc, Comm comm ) -{ ReduceScatter( buf, rc, mpi::SUM, comm ); } +template void ReduceScatter (float *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (double *buf, int rc, Op op, + Comm comm); +template void ReduceScatter (Complex < float >*buf, + int rc, Op op, Comm comm); +template void ReduceScatter (Complex < double >*buf, + int rc, Op op, Comm comm); + +template < typename T > +void ReduceScatter (T * buf, int rc, Comm comm) +{ + ReduceScatter (buf, rc, mpi::SUM, comm); +} -template void ReduceScatter( byte* buf, int rc, Comm comm ); -template void ReduceScatter( int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned* buf, int rc, Comm comm ); -template void ReduceScatter( long int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned long* buf, int rc, Comm comm ); +template void ReduceScatter (byte * buf, int rc, + Comm comm); +template void ReduceScatter (int *buf, int rc, Comm comm); +template void ReduceScatter (unsigned *buf, int rc, + Comm comm); +template void ReduceScatter (long int *buf, int rc, + Comm comm); +template void ReduceScatter (unsigned long *buf, int rc, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( long long int* buf, int rc, Comm comm ); -template void ReduceScatter( unsigned long long* buf, int rc, Comm comm ); +template void ReduceScatter (long long int *buf, int rc, + Comm comm); +template void ReduceScatter (unsigned long long *buf, + int rc, Comm comm); #endif -template void 
ReduceScatter( float* buf, int rc, Comm comm ); -template void ReduceScatter( double* buf, int rc, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Comm comm ); -template void ReduceScatter( Complex* buf, int rc, Comm comm ); - -template +template void ReduceScatter (float *buf, int rc, + Comm comm); +template void ReduceScatter (double *buf, int rc, + Comm comm); +template void ReduceScatter (Complex < float >*buf, + int rc, Comm comm); +template void ReduceScatter (Complex < double >*buf, + int rc, Comm comm); + +template < typename R > void ReduceScatter -( const R* sbuf, R* rbuf, const int* rcs, Op op, Comm comm ) +(const R * sbuf, R * rbuf, const int *rcs, Op op, + Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) SafeMpi - ( MPI_Reduce_scatter - ( const_cast(sbuf), - rbuf, const_cast(rcs), TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < R * >(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < R > (), op.op, comm.comm)); } -template +template < typename R > void ReduceScatter -( const Complex* sbuf, Complex* rbuf, const int* rcs, Op op, Comm comm ) +(const Complex < R > *sbuf, Complex < R > *rbuf, + const int *rcs, Op op, Comm comm) { - DEBUG_ONLY(CallStackEntry cse("mpi::ReduceScatter")) + DEBUG_ONLY (CallStackEntry cse ("mpi::ReduceScatter")) #ifdef EL_AVOID_COMPLEX_MPI - if( op == SUM ) + if (op == SUM) { int p; + MPI_Comm_size( comm.comm, &p ); vector rcsDoubled(p); for( int i=0; i*>(sbuf), - rbuf, rcsDoubled.data(), TypeMap(), op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, rcsDoubled.data (), + TypeMap < R > (), op.op, comm.comm)); } else { SafeMpi - ( MPI_Reduce_scatter - ( const_cast*>(sbuf), - rbuf, const_cast(rcs), TypeMap>(), - op.op, comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < Complex < R >> (), op.op, + 
comm.comm)); } #else SafeMpi - ( MPI_Reduce_scatter - ( const_cast*>(sbuf), - rbuf, const_cast(rcs), TypeMap>(), op.op, - comm.comm ) ); + (MPI_Reduce_scatter + (const_cast < Complex < R > *>(sbuf), + rbuf, const_cast < int *>(rcs), + TypeMap < Complex < R >> (), op.op, + comm.comm)); #endif } -template void ReduceScatter( const byte* sbuf, byte* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const int* sbuf, int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned* sbuf, unsigned* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const long int* sbuf, long int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned long* sbuf, unsigned long* rbuf, const int* rcs, Op op, Comm comm ); +template void ReduceScatter (const byte * sbuf, + byte * rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const int *sbuf, int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned *sbuf, + unsigned *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const long int *sbuf, + long int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned long *sbuf, + unsigned long *rbuf, + const int *rcs, Op op, + Comm comm); #ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( const long long int* sbuf, long long int* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const unsigned long long* sbuf, unsigned long long* rbuf, const int* rcs, Op op, Comm comm ); +template void ReduceScatter (const long long int *sbuf, + long long int *rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const unsigned long long + *sbuf, + unsigned long long *rbuf, + const int *rcs, Op op, + Comm comm); #endif -template void ReduceScatter( const float* sbuf, float* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const 
double* sbuf, double* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Op op, Comm comm ); -template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Op op, Comm comm ); - -template -void ReduceScatter( const T* sbuf, T* rbuf, const int* rcs, Comm comm ) -{ ReduceScatter( sbuf, rbuf, rcs, mpi::SUM, comm ); } +template void ReduceScatter (const float *sbuf, + float *rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const double *sbuf, + double *rbuf, const int *rcs, + Op op, Comm comm); +template void ReduceScatter (const Complex < float >*sbuf, + Complex < float >*rbuf, + const int *rcs, Op op, + Comm comm); +template void ReduceScatter (const Complex < + double >*sbuf, + Complex < double >*rbuf, + const int *rcs, Op op, + Comm comm); + +template < typename T > +void ReduceScatter (const T * sbuf, T * rbuf, + const int *rcs, Comm comm) +{ + ReduceScatter (sbuf, rbuf, rcs, mpi::SUM, comm); +} -template void ReduceScatter( const byte* sbuf, byte* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const int* sbuf, int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned* sbuf, unsigned* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const long int* sbuf, long int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned long* sbuf, unsigned long* rbuf, const int* rcs, Comm comm ); +template void ReduceScatter (const byte * sbuf, + byte * rbuf, const int *rcs, + Comm comm); +template void ReduceScatter (const int *sbuf, int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned *sbuf, + unsigned *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const long int *sbuf, + long int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned long *sbuf, + unsigned long *rbuf, + const int *rcs, Comm comm); 
#ifdef EL_HAVE_MPI_LONG_LONG -template void ReduceScatter( const long long int* sbuf, long long int* rbuf, const int* rcs, Comm comm ); -template void ReduceScatter( const unsigned long long* sbuf, unsigned long long* rbuf, const int* rcs, Comm comm ); +template void ReduceScatter (const long long int *sbuf, + long long int *rbuf, + const int *rcs, Comm comm); +template void ReduceScatter (const unsigned long long + *sbuf, + unsigned long long *rbuf, + const int *rcs, Comm comm); #endif + template void ReduceScatter( const float* sbuf, float* rbuf, const int* rcs, Comm comm ); template void ReduceScatter( const double* sbuf, double* rbuf, const int* rcs, Comm comm ); template void ReduceScatter( const Complex* sbuf, Complex* rbuf, const int* rcs, Comm comm ); diff --git a/tests/Axpy2.cpp b/tests/Axpy2.cpp new file mode 100644 index 0000000000..ee42728cd7 --- /dev/null +++ b/tests/Axpy2.cpp @@ -0,0 +1,188 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. 
+ * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + AxpyInterface2 < double > Axpy2int; + Axpy2int.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Axpy2int.Iacc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + 
<< std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Axpy2int.Flush ( B ); + + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Axpy2int.Iget (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + Axpy2int.Flush ( C ); + // Collectively detach in order to finish filling process 0's request + Axpy2int.Detach (); + +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/Axpy2b.cpp b/tests/Axpy2b.cpp new file mode 100644 index 0000000000..0ee0ebcaa8 --- /dev/null +++ b/tests/Axpy2b.cpp @@ -0,0 +1,183 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. 
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then a Barrier, + * then another epoch where all the ranks + * perform Get (on their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + 
t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + AxpyInterface2 < double > Axpy2int; + Axpy2int.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + //Axpy2int.Put (B, i, j); + Axpy2int.Acc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + //Axpy2int.Get (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Collectively detach in order to finish filling process 0's request + Axpy2int.Detach (); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of C"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // 
clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/Makefile.sample b/tests/Makefile.sample new file mode 100644 index 0000000000..4a7920a568 --- /dev/null +++ b/tests/Makefile.sample @@ -0,0 +1,28 @@ +VG_PATH = /home/sg/builds/valgrind +EL_PATH = /home/sg/builds/Elemental-updated +MPILOC = /home/sg/builds/mpich +CPPFLAGS = -g -O3 -pthread -DDEBUG=2 -std=c++11 -Wall -Wno-unused-variable +SRCS = $(wildcard *.cpp) +INCLUDES = -I$(MPILOC)/include -I$(EL_PATH)/include +LINK = -Wl,-rpath=$(EL_PATH)/lib -L$(EL_PATH)/lib -Wl,-rpath=$(MPILOC)/lib -L$(MPILOC)/lib +LIBS = -llapack -lblas -lrt -lm -lmpich -lopa -lmpl -lEl -lpmrrr +NAME = rmaaxpy +NPROCS = 4 + +all: $(NAME) + +$(NAME) : $(SRCS) + $(CXX) $(CPPFLAGS) $(INCLUDES) -o $(NAME) $? $(LINK) $(LIBS) +clean: + rm -f $(NAME) + +run: + $(MPILOC)/bin/mpiexec -n $(NPROCS) ./$(NAME) + +distclean: clean +profclean: + rm -f $(NAME).hpcstruct + rm -rf hpctoolkit-$(NAME)-measurements/ + rm -rf hpctoolkit-$(NAME)-database/ + rm -rf hpctoolkit-$(NAME)-database-*/ + rm -rf workspace/ diff --git a/tests/Rma.cpp b/tests/Rma.cpp new file mode 100644 index 0000000000..26f8647056 --- /dev/null +++ b/tests/Rma.cpp @@ -0,0 +1,187 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. 
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 + +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original 
distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Acc (B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Rmaint.Flush ( B ); + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Get (C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Get doesn't require flush though + //Rmaint.Flush ( C ); + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + /* + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + if (DIM <= 20) + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + */ +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken 
for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + + mpi::Finalize (); + return 0; +} diff --git a/tests/core/HFsimulAxpyInt.cpp b/tests/core/HFsimulAxpyInt.cpp new file mode 100644 index 0000000000..15e7f430c4 --- /dev/null +++ b/tests/core/HFsimulAxpyInt.cpp @@ -0,0 +1,208 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This also requires MPI-3, as we have used + * MPI-3 fetch and op to simulate a global + * counter. We could later use MPI-2 version of + * this function if necessary. 
+ */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 1 +//#define DIM 1000 +//#define AXPY_DIM 100 +//#define DIM 20 +//#define AXPY_DIM 4 +#define DIM 8 +#define AXPY_DIM 2 + +#define ALPHA 2.0 +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + MPI_Win win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof (long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + // Open up a LOCAL_TO_GLOBAL interface to A + AxpyInterface < double >interface; + interface.Attach (LOCAL_TO_GLOBAL, A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + interface.Axpy (ALPHA, B, i, j); +#if DEBUG > 2 + std::cout << 
std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + + interface.Detach (); + +#if DEBUG > 1 + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + if (DIM <= 20) + Print (A, "Updated distributed A"); + } + } +#endif + // Reattach to A, but in the GLOBAL_TO_LOCAL direction + interface.Attach (GLOBAL_TO_LOCAL, A); + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + // Bring my updated patch to me from DistMatrix + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + interface.Axpy (1.0, C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + interface.Detach (); +#if DEBUG > 1 + if (DIM <= 20 && commSize < 16) + { + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // Process 0 can now locally print its copy of A + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + } + else + { + if ( commRank == 0 && k == (ITER-1) ) + std::cout << "Inifinity norm of local matrix after " + << k+1 << " iterations: " << InfinityNorm ( C ) << "\n"; + } +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken (secs):%lf \n", total_secs); + + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + } + catch (std::exception & e) + { + ReportException (e); + } + + mpi::Finalize (); + return 0; +} diff --git a/tests/core/HFsimulRMAInt.cpp b/tests/core/HFsimulRMAInt.cpp new file mode 
100644 index 0000000000..d7dda9d164 --- /dev/null +++ b/tests/core/HFsimulRMAInt.cpp @@ -0,0 +1,194 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. + + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +/* + * This test approximates a Hartree-Fock + * application, all the ranks perform Acc + * (or Axpy) on different patches of the + * matrix during an epoch, then Flush all, + * then a Barrier, then another epoch + * where all the ranks perform Get (on + * their patch) + * Some of the MPI functions are not defined + * in El, hence this test mixes MPI routines + * and MPI from El. This is nasty, but at one + * point would be made better. + * This is implemented using MPI-3 + */ +#include "El.hpp" +#include +using namespace El; + +#define ITER 1 +//#define DIM 1000 +//#define AXPY_DIM 100 +//#define DIM 20 +//#define AXPY_DIM 4 +#define DIM 8 +#define AXPY_DIM 2 + +#define ALPHA 2.0 +#define FOP_ROOT 0 + +#if MPI_VERSION < 3 +# error SORRY, THE TEST ONLY WORKS WITH MPI VERSION > 3 +#endif + +long ReadInc (MPI_Win win, MPI_Aint offset, long inc) +{ + long otemp; + MPI_Fetch_and_op (&inc, &otemp, MPI_LONG, FOP_ROOT, offset, MPI_SUM, + win); + MPI_Win_flush (FOP_ROOT, win); + + return otemp; +} + +int main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + mpi::Window win; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + void *win_base; + long counter, next = 0; + + assert (DIM % AXPY_DIM == 0); + + try + { + // Initialization + // Allocate memory and create window for ReadInc + MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL, + comm.comm, &win_base, &win); + memset (win_base, 0, sizeof 
(long)); + MPI_Win_lock_all (MPI_MODE_NOCHECK, win); + + // Create window + Grid grid (comm); + + // Create an DIM X DIM distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + // AXPY into parts of the DistMatrix + counter = ReadInc (win, 0, (long) 1); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Acc (ALPHA, B, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": AXPY patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Flush all operations from B to DistMatrix + Rmaint.Flush ( B ); + mpi::Barrier ( comm ); + // Bring my updated patch to me from DistMatrix + Matrix < double >C; + Zeros (C, AXPY_DIM, AXPY_DIM); + for (int i = 0; i < DIM; i += AXPY_DIM) + { + if (counter == next) + { + for (int j = 0; j < DIM; j += AXPY_DIM) + { + Rmaint.Get (C, i, j); + //Rmaint.LocalAcc (1.0, C, i, j); +#if DEBUG > 2 + std::cout << std::to_string(commRank) + ": GET patch: " + + std::to_string(i) + "," + std::to_string(j) + << std::endl; +#endif + } + counter = ReadInc (win, 0, (long) 1); + } + next++; + } + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); + +#if DEBUG > 1 + if (DIM <= 20 && commSize < 16) + { + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + Print (A, "Updated distributed A"); + } + } + mpi::Barrier ( comm ); + for (int j = 0; j < commSize; j++) + { + if (j == commRank) + { + // 
Process 0 can now locally print its copy of A + Print (C, "Patch of A"); + } + } + mpi::Barrier ( comm ); + } + else + { + if ( commRank == 0 && k == (ITER-1) ) + std::cout << "Inifinity norm of local matrix after " + << k+1 << " iterations: " << InfinityNorm ( C ) << "\n"; + } +#endif + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken (secs):%lf \n", total_secs); + // clear window object for FOP + MPI_Win_unlock_all (win); + MPI_Win_free (&win); + } + catch (std::exception & e) + { + ReportException (e); + } + + mpi::Finalize (); + return 0; +} diff --git a/tests/core/RmaInterface.cpp b/tests/core/RmaInterface.cpp new file mode 100644 index 0000000000..f511f28c62 --- /dev/null +++ b/tests/core/RmaInterface.cpp @@ -0,0 +1,108 @@ +/* + Copyright (c) 2009-2014, Jack Poulson + Copyright (c) 2011, The University of Texas at Austin + Copyright (c) 2014, Sayan Ghosh, University of Houston + All rights reserved. 
+ + This file is part of Elemental and is under the BSD 2-Clause License, + which can be found in the LICENSE file in the root directory, or at +http://opensource.org/licenses/BSD-2-Clause +*/ +#include "El.hpp" +using namespace El; + +#define ITER 10 +//#define DIM 1000 +//#define AXPY_DIM 100 +#define DIM 20 +#define AXPY_DIM 4 +#define ALPHA 2.0 +#include + int +main (int argc, char *argv[]) +{ + Initialize (argc, argv); + mpi::Comm comm = mpi::COMM_WORLD; + const Int commRank = mpi::Rank (comm); + const Int commSize = mpi::Size (comm); + double t1, t2, seconds; + + assert (AXPY_DIM < DIM); + + try + { + Grid grid (comm); + + // Create an 8 x 8 distributed matrix over the given grid + DistMatrix < double, MC, MR > A (DIM, DIM, grid); + + // Set every entry of A to zero + Zeros (A, DIM, DIM); + + // Print the original A + if (DIM <= 20) + Print (A, "Original distributed A"); + + t1 = MPI_Wtime(); + for (Int k = 0; k < ITER; ++k) + { + if (commRank == 0) + std::cout << "Iteration " << k << std::endl; + + RmaInterface < double > Rmaint; + Rmaint.Attach (A); + + // If we are process 0, then create a 3 x 3 identity matrix, B, + // and Axpy it into the bottom-right of A (using alpha=2) + // NOTE: The bottom-right 3 x 3 submatrix starts at the (5,5) + // entry of A. + // NOTE: Every process is free to Axpy as many submatrices as they + // desire at this point. + if (grid.VCRank () == 0) + { + Matrix < double >B (AXPY_DIM, AXPY_DIM); + Identity (B, AXPY_DIM, AXPY_DIM); + //Print (B, "Original B"); + // AXPY is scaled accumulate as in ARMCI + Rmaint.Acc (ALPHA, B, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + Rmaint.Flush (B, (DIM - AXPY_DIM), (DIM - AXPY_DIM)); + //Print (B, "Updated B"); + } + // + // NOTE: Every process is free to Axpy as many submatrices as they + // desire at this point. 
+ Matrix < double >C; + if (grid.VCRank () == 0) + { + Zeros (C, DIM, DIM); + Rmaint.Get (C, 0, 0); + Rmaint.Flush ( C ); + } + + + // Collectively detach in order to finish filling process 0's request + Rmaint.Detach (); + + if (DIM <= 20) + Print (A, "Updated distributed A"); + // Process 0 can now locally print its copy of A + if (grid.VCRank () == 0 && DIM <= 20) + Print (C, "Process 0's local copy of A"); + } + t2 = MPI_Wtime(); + seconds = (t2 - t1); ///ITER; + double total_secs; + + MPI_Reduce(&seconds, &total_secs, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (commRank == 0) + printf("Time taken for AXPY (secs):%lf \n", total_secs); + } + catch (std::exception & e) + { + ReportException (e); + } + + Finalize (); + return 0; +}