Commit 382bef1: wip

howjmay committed Jul 18, 2024
1 parent 99b741e commit 382bef1
Showing 1 changed file with 39 additions and 78 deletions.
117 changes: 39 additions & 78 deletions sse2neon.h
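
The change is mechanical throughout: scalar-fallback assignments that had been wrapped across two lines are rejoined onto single lines, with no behavioral difference. Every touched statement funnels a 64-bit lane through sse2neon_recast_u64_f64, which reinterprets the lane's bit pattern as a double. That helper is outside the hunks below; a minimal sketch of the idiom it is built on (memcpy-based type punning; the name recast_u64_f64 here is illustrative, not the library's exact definition):

#include <stdint.h>
#include <string.h>

// Reinterpret the bits of a uint64_t as a double. memcpy is the
// strict-aliasing-safe way to type-pun; optimizing compilers lower it
// to a plain register move.
static inline double recast_u64_f64(uint64_t u64)
{
    double f64;
    memcpy(&f64, &u64, sizeof f64);
    return f64;
}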
@@ -3309,11 +3309,9 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
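
A note on the semantics this fallback implements: _mm_cmpge_sd compares only the low doubles, writes an all-ones or all-zeros 64-bit mask into the low lane, and passes a's upper lane through unchanged, which is why d[1] copies a1's raw bits rather than comparing anything. A minimal usage sketch, assuming sse2neon.h is on the include path:

#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    __m128d a = _mm_set_pd(7.0, 2.0); // upper lane 7.0, lower lane 2.0
    __m128d b = _mm_set_pd(9.0, 1.0); // upper lane 9.0, lower lane 1.0
    __m128d m = _mm_cmpge_sd(a, b);   // low lane: 2.0 >= 1.0 -> all-ones mask
    double out[2];
    _mm_storeu_pd(out, m);
    printf("upper lane passthrough: %f\n", out[1]); // prints 7.000000
    return 0;
}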
@@ -3385,11 +3383,9 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
 #else
     // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
@@ -3434,11 +3430,9 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
 #else
     // expand "_mm_cmple_pd()" to reduce unnecessary operations
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
@@ -3512,11 +3506,9 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
@@ -3730,11 +3722,9 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
     d[1] = a1;
@@ -3783,11 +3773,9 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
     uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
     d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
     d[1] = a1;
@@ -3805,10 +3793,8 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     return a0 >= b0;
 #endif
 }
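
Unlike the cmp* family above, the comi* intrinsics return a plain int rather than a lane mask, and the scalar C comparison gives the expected unordered behavior for free: any NaN operand makes a0 >= b0 false. A small sketch of that contract (illustrative values):

#include <math.h>
#include "sse2neon.h"

// _mm_comige_sd yields 1 when the low double of a is >= that of b,
// and 0 otherwise -- including the unordered (NaN) case.
int demo_comige(void)
{
    int ge = _mm_comige_sd(_mm_set_sd(3.0), _mm_set_sd(1.0)); // 1
    int un = _mm_comige_sd(_mm_set_sd(NAN), _mm_set_sd(1.0)); // 0
    return ge + un; // 1
}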
@@ -3822,10 +3808,8 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
     return a0 > b0;
 #endif
@@ -3840,10 +3824,8 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
     return a0 <= b0;
 #endif
@@ -3858,10 +3840,8 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
     double a0, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
     return a0 < b0;
 #endif
@@ -3933,10 +3913,8 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
     double d0, d1;
-    d0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
-    d1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    d0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 #endif
 }
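
The round-then-cast structure matters here: _mm_cvtpd_epi32 converts under the current rounding mode, so the fallback first rounds with _MM_FROUND_CUR_DIRECTION and only then casts, and the two upper result lanes are zeroed by the _mm_set_epi32 call. A sketch of the observable behavior under the default round-to-nearest-even mode:

#include "sse2neon.h"

// Halfway cases round to even under the default mode: 2.5 -> 2, -1.5 -> -2.
__m128i demo_cvtpd_epi32(void)
{
    __m128d v = _mm_set_pd(2.5, -1.5); // upper 2.5, lower -1.5
    return _mm_cvtpd_epi32(v);         // lanes, low to high: {-2, 2, 0, 0}
}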
@@ -3948,10 +3926,8 @@ FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
     double d0, d1;
-    d0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
-    d1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    d0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -3967,10 +3943,8 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_ps(0, 0, (float) a1, (float) a0);
 #endif
 }
@@ -4234,10 +4208,8 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
@@ -4247,10 +4219,8 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a)
 {
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
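
By contrast, the tt variants truncate toward zero regardless of the rounding mode, which is why _mm_cvttpd_epi32 and _mm_cvttpd_pi32 skip the _mm_round_pd step entirely: a C cast from double to int32_t already truncates. For example:

#include "sse2neon.h"

// Truncation drops the fraction toward zero: 2.9 -> 2, -1.9 -> -1.
__m128i demo_cvttpd_epi32(void)
{
    __m128d v = _mm_set_pd(2.9, -1.9); // upper 2.9, lower -1.9
    return _mm_cvttpd_epi32(v);        // lanes, low to high: {-1, 2, 0, 0}
}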
@@ -4651,12 +4621,9 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
     double a0, a1, b0;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
-    b0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     double c[2] = {a0 < b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
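
The ternary a0 < b0 ? a0 : b0 reproduces x86 MINSD behavior: when the comparison is false, including whenever either operand is NaN, the second source (b) is returned, so _mm_min_sd is not symmetric in the presence of NaNs. A sketch:

#include <math.h>
#include "sse2neon.h"

// MINSD semantics: min(NaN, x) == x, but min(x, NaN) == NaN, because a
// false compare always selects the b operand.
double demo_min_sd(void)
{
    __m128d r = _mm_min_sd(_mm_set_sd(NAN), _mm_set_sd(4.0));
    return _mm_cvtsd_f64(r); // 4.0
}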
@@ -5405,10 +5372,8 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     double _a0 = sqrt(a0);
     double _a1 = sqrt(a1);
     return _mm_set_pd(_a1, _a0);
@@ -6945,10 +6910,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_pd(ceil(a1), ceil(a0));
 #endif
 }
@@ -7269,10 +7232,8 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
     double a0, a1;
-    a0 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
-    a1 =
-        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
     return _mm_set_pd(floor(a1), floor(a0));
 #endif
 }