Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vfmadd #289

Merged
merged 8 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fflas-ffpack/fflas/fflas_fadd.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace FFLAS {
struct support_simd_add<float> : public std::true_type {} ;
template<>
struct support_simd_add<double> : public std::true_type {} ;
#ifdef SIMD_INT
#ifdef SIMD_INT // why? PK - 2019/12
template<>
struct support_simd_add<int32_t> : public std::true_type {} ;
template<>
Expand Down
208 changes: 203 additions & 5 deletions fflas-ffpack/fflas/fflas_faxpy.inl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (C) 2005 Clement Pernet
*
* Written by Clement Pernet <[email protected]>
* Pierre Karpman <[email protected]>
*
*
* ========LICENCE========
Expand All @@ -27,18 +28,192 @@
#ifndef __FFLASFFPACK_faxpy_INL
#define __FFLASFFPACK_faxpy_INL

/* most of this is a shameless copy from fscal... */

namespace FFLAS { namespace vectorised { namespace unswitch {

namespace FFLAS {
#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
// TODO check if returning in register is better??
template<class Field, class SimdT>
inline void
faxpy( const Field& F, const size_t N,
VEC_FMA(typename SimdT::vect_t & A, typename SimdT::vect_t & X, typename SimdT::vect_t & Y, HelperModSimd<Field,SimdT> & H)
{
Y = SimdT::fmadd(Y,A,X);
VEC_MOD(Y, H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_simd_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &G)
{
typedef typename Field::Element Element;
using simd = Simd<Element>;
using vect_t = typename simd::vect_t;
HelperModSimd<Field,simd> H(F,G);
vect_t A, U, V;
A = simd::set1(a);

size_t i = 0;

if (n < simd::vect_size)
{
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
return;
}

long st = long(Y) % simd::alignment;
if (st)
{ // the array Y is not aligned (process few elements until (Y+i) is aligned)
for (size_t j = static_cast<size_t>(st) ; j < simd::alignment ; j+=sizeof(Element), i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}
FFLASFFPACK_check((long(Y+i) % simd::alignment == 0));
if (long(Y+i)%simd::alignment==0)
P1K marked this conversation as resolved.
Show resolved Hide resolved
{
// perform the loop using SIMD
for (;i <= n - simd::vect_size ; i += simd::vect_size)
{
U = simd::loadu(X+i); // could be transformed into a load at the cost of an additional top check. Worth it?
V = simd::load(Y+i);
VEC_FMA(A, U, V, H);
simd::store(Y+i,V);
}
}
// perform the last elt from T without SIMD
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}

#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
inline typename std::enable_if<!FFLAS::support_simd_mod<typename Field::Element>::value && FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &H)
{
size_t i = 0;

for (; i < n ; i++)
{
Y[i] = reduce(a*X[i] + Y[i], H);
}
}


template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY,
HelperMod<Field> &H)
{
typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
for (; Xi < X+n*incX; Xi+=incX, Yi+=incY )
*Yi = reduce(a*(*Xi) + (*Yi), H);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that the infix operators + and * are legit for this field?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should come from support_fast_mod (also implied by support_simd_mod)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. At some point, one should consider cleaning up these traits. This is #304

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This axpyp should also not be in the namespace vectorised as it does not use vectorization

}

} // unswitch
} // vectorised
} // FFLAS

namespace FFLAS { namespace vectorised {

// simd_mod => fast_mod, so this declaration's one in two
template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,incX,incY,H);
}

} // vectorised
} // FFLAS

namespace FFLAS { namespace details {

/*** Entry-points for specialised code ***/
// simd_mod => fast_mod;
// will default to the best supported option in unswitch

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
faxpy( const Field & F , const size_t N,
const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY )
typename Field::Element_ptr Y, const size_t incY,
FieldCategories::ModularTag)
{
if((incX == 1) && (incY == 1))
{
vectorised::axpyp(F,a,X,Y,N);
}
else
{
if (N < FFLASFFPACK_COPY_REDUCE) // defined in fflas_freduce.inl
{
vectorised::axpyp(F,a,X,Y,N,incX,incY);
}
else
{
typename Field::Element_ptr Xc;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Define Xc as ConstElement_ptr instead, which will remove the need for const_cast

Copy link
Member Author

@P1K P1K Mar 30, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cf. our offline discussion, the const_cast is needed for compatibility with the signature of faxpy. The proposed fix doesn't work (as is) since there is a branch where Xc is assigned

typename Field::Element_ptr Yc;
if (incX != 1)
{
Xc = fflas_new (F,N);
fassign(F,N,X,incX,Xc,1);
}
else
{
Xc = const_cast<typename Field::Element_ptr>(X); // Oh the horror
}
if (incY != 1)
{
Yc = fflas_new (F,N);
fassign(F,N,Y,incY,Yc,1);
}
else
{
Yc = Y;
}

vectorised::axpyp(F,a,Xc,Yc,N);

if (incY != 1)
{
fassign(F,N,Yc,1,Y,incY);
fflas_delete(Yc);
}
if (incX != 1)
{
fflas_delete(Xc);
}
}
}
}

template<class Field, class FC>
inline void
faxpy(const Field& F, const size_t N, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY, FC)
{
if (F.isZero(a))
return ;

Expand All @@ -51,11 +226,29 @@ namespace FFLAS {
//return fneg(F,N,X,incX,Y,incY);

typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
typename Field::Element_ptr Yi = Y;
for (; Xi < X+N*incX; Xi+=incX, Yi+=incY )
F.axpyin( *Yi, a, *Xi );
}

} // details
} // FFLAS

namespace FFLAS {

/***************************/
/* LEVEL 1 */
/***************************/

template<class Field>
inline void
faxpy(const Field& F, const size_t n, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY)
{
details::faxpy(F,n,a,X,incX,Y,incY,typename FieldTraits<Field>::category());
}

template<>
inline void
faxpy( const Givaro::DoubleDomain& , const size_t N,
Expand Down Expand Up @@ -84,6 +277,11 @@ namespace FFLAS {
cblas_saxpy( (int)N, a, x, (int)incx, y, (int)incy);
}


/***************************/
/* LEVEL 2 */
/***************************/

template<class Field>
inline void
faxpy( const Field& F, const size_t m, const size_t n,
Expand Down
49 changes: 44 additions & 5 deletions fflas-ffpack/fflas/fflas_freduce.inl
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ namespace FFLAS { namespace details {
{
typename Field::Element_ptr Ac = fflas_new (F,m) ;
fassign (F,m,A,incX,Ac,1);
freduce (F,m,Ac,1,FieldCategories::ModularTag());
vectorised::modp(F,Ac,m,Ac);
fassign (F,m,Ac,1,A,incX);
fflas_delete (Ac);
}
Expand All @@ -544,10 +544,49 @@ namespace FFLAS { namespace details {
}
else
{
typename Field::Element_ptr Xi = A ;
typename Field::ConstElement_ptr Yi = B ;
for (; Xi < A+m*incX; Xi+=incX, Yi += incY )
F.reduce (*Xi , *Yi);
if (m < FFLASFFPACK_COPY_REDUCE)
{
// TODO also write a ``fast'' modp that handles an incY
typename Field::Element_ptr Xi = A ;
typename Field::ConstElement_ptr Yi = B ;
for (; Xi < A+m*incX; Xi+=incX, Yi += incY )
F.reduce (*Xi , *Yi);
}
else
{
typename Field::Element_ptr Ac;
typename Field::Element_ptr Bc;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need Bc to be Element_ptr and not ConstElement_ptr, which would then remove the need for const_cast

if (incX != 1)
{
Ac = fflas_new (F,m);
fassign(F,m,A,incX,Ac,1);
}
else
{
Ac = A;
}
if (incY != 1)
{
Bc = fflas_new (F,m);
fassign(F,m,B,incY,Bc,1);
}
else
{
Bc = const_cast<typename Field::Element_ptr>(B); // Oh the horror
}

vectorised::modp(F,Bc,m,Ac);

if (incX != 1)
{
fassign(F,m,Ac,1,A,incX);
fflas_delete(Ac);
}
if (incY != 1)
{
fflas_delete(Bc);
}
}
}
}

Expand Down
Loading