Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vfmadd #289

Merged
merged 8 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fflas-ffpack/fflas/fflas_fadd.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace FFLAS {
struct support_simd_add<float> : public std::true_type {} ;
template<>
struct support_simd_add<double> : public std::true_type {} ;
#ifdef SIMD_INT
#ifdef SIMD_INT // why? PK - 2019/12
template<>
struct support_simd_add<int32_t> : public std::true_type {} ;
template<>
Expand Down
184 changes: 179 additions & 5 deletions fflas-ffpack/fflas/fflas_faxpy.inl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (C) 2005 Clement Pernet
*
* Written by Clement Pernet <[email protected]>
* Pierre Karpman <[email protected]>
*
*
* ========LICENCE========
Expand All @@ -27,18 +28,168 @@
#ifndef __FFLASFFPACK_faxpy_INL
#define __FFLASFFPACK_faxpy_INL

/* most of this is a shameless copy from fscal... */

namespace FFLAS { namespace vectorised { namespace unswitch {

namespace FFLAS {
#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
// TODO check if returning in register is better??
template<class Field, class SimdT>
inline void
faxpy( const Field& F, const size_t N,
VEC_FMA(typename SimdT::vect_t & A, typename SimdT::vect_t & X, typename SimdT::vect_t & Y, HelperModSimd<Field,SimdT> & H)
{
Y = SimdT::fmadd(Y,A,X);
VEC_MOD(Y, H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_simd_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &G)
{
typedef typename Field::Element Element;
using simd = Simd<Element>;
using vect_t = typename simd::vect_t;
HelperModSimd<Field,simd> H(F,G);
vect_t A, U, V;
A = simd::set1(a);

size_t i = 0;

if (n < simd::vect_size)
{
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
return;
}

long st = long(Y) % simd::alignment;
if (st)
{ // the array Y is not aligned (process few elements until (Y+i) is aligned)
for (size_t j = static_cast<size_t>(st) ; j < simd::alignment ; j+=sizeof(Element), i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}
FFLASFFPACK_check((long(Y+i) % simd::alignment == 0));
if (long(Y+i)%simd::alignment==0)
P1K marked this conversation as resolved.
Show resolved Hide resolved
{
// perform the loop using SIMD
for (;i <= n - simd::vect_size ; i += simd::vect_size)
{
U = simd::loadu(X+i); // could be transformed into a load at the cost of an additional top check. Worth it?
V = simd::load(Y+i);
VEC_FMA(A, U, V, H);
simd::store(Y+i,V);
}
}
// perform the last elt from T without SIMD
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}

#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
inline typename std::enable_if<!FFLAS::support_simd_mod<typename Field::Element>::value && FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &H)
{
size_t i = 0;

for (; i < n ; i++)
{
Y[i] = reduce(a*X[i] + Y[i], H);
}
}


template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY,
HelperMod<Field> &H)
{
typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
for (; Xi < X+n*incX; Xi+=incX, Yi+=incY )
*Yi = reduce(a*(*Xi) + (*Yi), H);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the infix operators + and * are legit for this field?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should come from support_fast_mod (also implied by support_simd_mod)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. At some point, one should consider cleaning up these traits. This is #304

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This axpyp should also not be in the namespace vectorised as it does not use vectorization

}

} // unswitch
} // vectorised
} // FFLAS

namespace FFLAS { namespace vectorised {

// simd_mod => fast_mod, so this declaration's one in two
template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,incX,incY,H);
}

} // vectorised
} // FFLAS

namespace FFLAS { namespace details {

/*** Entry-points for specialised code ***/
// simd_mod => fast_mod;
// will default to the best supported option in unswitch

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
faxpy( const Field & F , const size_t N,
const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY )
typename Field::Element_ptr Y, const size_t incY,
FieldCategories::ModularTag)
{
if((incX == 1) && (incY == 1))
{
vectorised::axpyp(F,a,X,Y,N);
}
else
{
if (N < FFLASFFPACK_COPY_REDUCE) // defined in fflas_freduce.inl
{
vectorised::axpyp(F,a,X,Y,N,incX,incY);
}
else
{
typename Field::Element_ptr Xc = fflas_new (F,N) ;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If only one of X or Y has an inc !=1 then this code copies both vectors anyway!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks indeed wasteful

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be fixed in latest commit

typename Field::Element_ptr Yc = fflas_new (F,N) ;
fassign(F,N,X,incX,Xc,1);
fassign(F,N,Y,incY,Yc,1);
faxpy(F,N,a,Xc,1,Yc,1,FieldCategories::ModularTag());
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From JGD: why not a call to vectorised::axpyp directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be fixed in latest commit

fassign(F,N,Yc,1,Y,incY);
fflas_delete(Xc);
fflas_delete(Yc);
}
}
}

template<class Field, class FC>
inline void
faxpy(const Field& F, const size_t N, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY, FC)
{
if (F.isZero(a))
return ;

Expand All @@ -51,11 +202,29 @@ namespace FFLAS {
//return fneg(F,N,X,incX,Y,incY);

typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
typename Field::Element_ptr Yi = Y;
for (; Xi < X+N*incX; Xi+=incX, Yi+=incY )
F.axpyin( *Yi, a, *Xi );
}

} // details
} // FFLAS

namespace FFLAS {

/***************************/
/* LEVEL 1 */
/***************************/

template<class Field>
inline void
faxpy(const Field& F, const size_t n, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY)
{
details::faxpy(F,n,a,X,incX,Y,incY,typename FieldTraits<Field>::category());
}

template<>
inline void
faxpy( const Givaro::DoubleDomain& , const size_t N,
Expand Down Expand Up @@ -84,6 +253,11 @@ namespace FFLAS {
cblas_saxpy( (int)N, a, x, (int)incx, y, (int)incy);
}


/***************************/
/* LEVEL 2 */
/***************************/

template<class Field>
inline void
faxpy( const Field& F, const size_t m, const size_t n,
Expand Down
33 changes: 27 additions & 6 deletions fflas-ffpack/fflas/fflas_fscal.inl
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ namespace FFLAS { namespace vectorised { namespace unswitch {
}
}


template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n, const size_t & incX, HelperMod<Field> &H)
Expand All @@ -116,6 +115,19 @@ namespace FFLAS { namespace vectorised { namespace unswitch {
}
}

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n,
const size_t & incX, const size_t & incY, HelperMod<Field> &H)
{
typename Field::ConstElement_ptr Xi = U;
typename Field::Element_ptr Yi = T;
for (; Xi < U+n*incX ; Xi+=incX,Yi+=incY)
{
*Yi = reduce(alpha*(*Xi),H);
}
}

} // unswitch
} // vectorised
} // FFLAS
Expand All @@ -141,6 +153,16 @@ namespace FFLAS { namespace vectorised {
unswitch::scalp(F,T,alpha,U,n,incX,H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n, const size_t & incX, const size_t & incY)
{
HelperMod<Field> H(F);

unswitch::scalp(F,T,alpha,U,n,incX,incY,H);
}


} // vectorised
} // FFLAS

Expand Down Expand Up @@ -187,11 +209,10 @@ namespace FFLAS { namespace details {
if(incX == 1 && incY==1) {
vectorised::scalp(F,Y,a,X,N);
}
else {
typename Field::ConstElement_ptr Xi = X ;
typename Field::Element_ptr Yi = Y ;
for (; Xi < X+N*incX; Xi+=incX,Yi+=incY )
F.mul(*Yi, *Xi , a);
else
{
// no copy for now
vectorised::scalp(F,Y,a,X,N,incX,incY);
}
}

Expand Down