Skip to content

Commit

Permalink
wip:
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Jul 18, 2024
1 parent 382bef1 commit e14d9b8
Showing 1 changed file with 26 additions and 30 deletions.
56 changes: 26 additions & 30 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -3020,12 +3020,9 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_add_pd(a, b));
#else
double a0, a1, b0;
a0 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
a1 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
b0 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
double c[2];
c[0] = a0 + b0;
c[1] = a1;
Expand Down Expand Up @@ -3913,8 +3910,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
#else
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0, d1;
d0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
d1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
d0 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
d1 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
#endif
}
Expand All @@ -3926,8 +3925,10 @@ FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
{
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0, d1;
d0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
d1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
d0 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
d1 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
return vreinterpret_m64_s32(vld1_s32(data));
}
Expand Down Expand Up @@ -4059,8 +4060,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double ret =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
double ret = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
return (int32_t) ret;
#endif
}
Expand All @@ -4074,8 +4075,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double ret =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
double ret = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
return (int64_t) ret;
#endif
}
Expand Down Expand Up @@ -4197,8 +4198,8 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
return vreinterpretq_m128d_f64(
vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
#else
return vreinterpretq_m128d_s64(
vsetq_lane_s64(sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
return vreinterpretq_m128d_s64(vsetq_lane_s64(
sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
#endif
}

Expand Down Expand Up @@ -4552,12 +4553,9 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_max_pd(a, b));
#else
double a0, a1, b0;
a0 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
a1 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
b0 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
double c[2] = {a0 > b0 ? a0 : b0, a1};
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
#endif
Expand Down Expand Up @@ -5390,10 +5388,8 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
return _mm_move_sd(a, _mm_sqrt_pd(b));
#else
double _a, _b;
_a =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
_b =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
_a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
_b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
return _mm_set_pd(_a, sqrt(_b));
#endif
}
Expand Down Expand Up @@ -7137,10 +7133,10 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
#if defined(__aarch64__) || defined(_M_ARM64)
double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
#else
double _tmp0 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
double _tmp1 =
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
double _tmp0 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
double _tmp1 = sse2neon_recast_u64_f64(
vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
double sum = _tmp0 + _tmp1;
#endif
// Conditionally store the sum
Expand Down

0 comments on commit e14d9b8

Please sign in to comment.