Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Jun 28, 2024
1 parent 17040e2 commit 14690a6
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ jobs:
matrix:
arch_with_features: [
# {arch: armv7, feature: none, arch_cflags: none},
{arch: aarch64, feature: none, arch_cflags: -O3},
# {arch: aarch64, feature: none, arch_cflags: none},
# {arch: aarch64, feature: crypto+crc, arch_cflags: none},
# {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'}
{arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8 -O3'}
]
cxx_compiler: [g++-10, clang++-11]
steps:
Expand Down
111 changes: 69 additions & 42 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -2992,11 +2992,14 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_f64(
vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
double *da = (double *) &a;
double *db = (double *) &b;
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
double c[2];
c[0] = da[0] + db[0];
c[1] = da[1] + db[1];
c[0] = a0.f64 + b0.f64;
c[1] = a1.f64 + b1.f64;
return vld1q_f32((float32_t *) c);
#endif
}
Expand All @@ -3010,11 +3013,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_add_pd(a, b));
#else
double *da = (double *) &a;
double *db = (double *) &b;
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
double c[2];
c[0] = da[0] + db[0];
c[1] = da[1];
c[0] = a0.f64 + b0.f64;
c[1] = a1.f64;
return vld1q_f32((float32_t *) c);
#endif
}
Expand Down Expand Up @@ -3868,9 +3873,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
#else
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0 = ((double *) &rnd)[0];
double d1 = ((double *) &rnd)[1];
return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
bit64_union_t d0, d1;
d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
return _mm_set_epi32(0, 0, (int32_t) d1.f64, (int32_t) d0.f64);
#endif
}

Expand All @@ -3880,9 +3886,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
{
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0 = ((double *) &rnd)[0];
double d1 = ((double *) &rnd)[1];
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
bit64_union_t d0, d1;
d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0.f64, (int32_t) d1.f64};
return vreinterpret_m64_s32(vld1_s32(data));
}

Expand Down Expand Up @@ -4502,9 +4509,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_max_pd(a, b));
#else
double *da = (double *) &a;
double *db = (double *) &b;
double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
double c[2] = {a0.f64 > b0.f64 ? a0.f64 : b0.f64, a1.f64};
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
#endif
}
Expand Down Expand Up @@ -4563,9 +4572,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
#if defined(__aarch64__) || defined(_M_ARM64)
return _mm_move_sd(a, _mm_min_pd(a, b));
#else
double *da = (double *) &a;
double *db = (double *) &b;
double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
bit64_union_t a0, a1, b0;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
double c[2] = {a0.f64 < b0.f64 ? a0.f64 : b0.f64, a1.f64};
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
#endif
}
Expand Down Expand Up @@ -4720,11 +4731,14 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_f64(
vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
double *da = (double *) &a;
double *db = (double *) &b;
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
double c[2];
c[0] = da[0] * db[0];
c[1] = da[1] * db[1];
c[0] = a0.f64 * b0.f64;
c[1] = a1.f64 * b1.f64;
return vld1q_f32((float32_t *) c);
#endif
}
Expand Down Expand Up @@ -5683,11 +5697,14 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_f64(
vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
double *da = (double *) &a;
double *db = (double *) &b;
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
double c[2];
c[0] = da[0] - db[0];
c[1] = da[1] - db[1];
c[0] = a0.f64 - b0.f64;
c[1] = a1.f64 - b1.f64;
return vld1q_f32((float32_t *) c);
#endif
}
Expand Down Expand Up @@ -5991,9 +6008,12 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
return vreinterpretq_m128d_f64(
vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
double *da = (double *) &a;
double *db = (double *) &b;
double c[] = {da[0] + da[1], db[0] + db[1]};
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
double c[] = {a0.f64 + a1.f64, b0.f64 + b1.f64};
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
Expand All @@ -6019,17 +6039,20 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
// Horizontally subtract adjacent pairs of double-precision (64-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
float64x2_t a = vreinterpretq_f64_m128d(_a);
float64x2_t b = vreinterpretq_f64_m128d(_b);
float64x2_t _a = vreinterpretq_f64_m128d(a);
float64x2_t _b = vreinterpretq_f64_m128d(b);
return vreinterpretq_m128d_f64(
vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
#else
double *da = (double *) &_a;
double *db = (double *) &_b;
double c[] = {da[0] - da[1], db[0] - db[1]};
bit64_union_t a0, a1, b0, b1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
double c[] = {a0.f64 - a1.f64, b0.f64 - b1.f64};
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
Expand Down Expand Up @@ -6825,8 +6848,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
#if defined(__aarch64__) || defined(_M_ARM64)
return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
#else
double *f = (double *) &a;
return _mm_set_pd(ceil(f[1]), ceil(f[0]));
bit64_union_t a0, a1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
return _mm_set_pd(ceil(a1.f64), ceil(a0.f64));
#endif
}

Expand Down Expand Up @@ -7141,8 +7166,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
#if defined(__aarch64__) || defined(_M_ARM64)
return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
#else
double *f = (double *) &a;
return _mm_set_pd(floor(f[1]), floor(f[0]));
bit64_union_t a0, a1;
a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
return _mm_set_pd(floor(a1.f64), floor(a0.f64));
#endif
}

Expand Down

0 comments on commit 14690a6

Please sign in to comment.