diff --git a/changelog.txt b/changelog.txt index 73c8d61..cd1a4ba 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,5 +1,7 @@ Change log for Vector class library ----------------------------------- +2023-06-03 version 2.02.01 + * minor bug fixes and updates 2022-07-20 version 2.02.00 * support half precision floating point vectors diff --git a/dispatch_example2.cpp b/dispatch_example2.cpp index 5360b41..c8e860e 100644 --- a/dispatch_example2.cpp +++ b/dispatch_example2.cpp @@ -1,7 +1,7 @@ /************************* dispatch_example2.cpp *************************** Author: Agner Fog Date created: 2012-05-30 -Last modified: 2020-02-25 +Last modified: 2023-06-03 Version: 2.02.00 Project: vector class library Description: Example of automatic CPU dispatching. @@ -49,7 +49,7 @@ clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example2.cpp instrset_detect.cpp d7. # Run the program ./test.exe -(c) Copyright 2012-2022 Agner Fog. +(c) Copyright 2012-2023 Agner Fog. Apache License version 2.0 or later. ******************************************************************************/ @@ -168,7 +168,7 @@ float myfunc_dispatch(float const f[]) { // Choose which version of the entry function we want to point to: if (iset >= 10) myfunc_pointer = &Ns_AVX512::myfunc; // AVX512 version else if (iset >= 8) myfunc_pointer = &Ns_AVX2::myfunc; // AVX2 version - else if (iset >= 5) myfunc_pointer = &Ns_AVX::myfunc; // AVX version + else if (iset >= 7) myfunc_pointer = &Ns_AVX::myfunc; // AVX version else if (iset >= 2) myfunc_pointer = &Ns_SSE2::myfunc; // SSE2 version else { // Error: lowest instruction set not supported. 
diff --git a/instrset.h b/instrset.h index 5f646da..7f6fd3c 100644 --- a/instrset.h +++ b/instrset.h @@ -1,8 +1,8 @@ /**************************** instrset.h ********************************** * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-26 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file for various compiler-specific tasks as well as common @@ -16,7 +16,7 @@ * * For instructions, see vcl_manual.pdf * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. ******************************************************************************/ @@ -110,6 +110,7 @@ #endif #include // Define integer types with known size +#include // Define INT_MAX #include // define abs(int) diff --git a/vectorf128.h b/vectorf128.h index 5dfa96e..00c6586 100644 --- a/vectorf128.h +++ b/vectorf128.h @@ -1,8 +1,8 @@ /**************************** vectorf128.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 128-bit floating point vector classes @@ -18,7 +18,7 @@ * Each vector object is represented internally in the CPU as a 128-bit register. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -2801,7 +2801,10 @@ static inline Vec4f lookup(Vec4i const index, float const * table) { } // n > 8. 
Limit index Vec4ui index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4ui(index) & (n - 1); } @@ -2853,7 +2856,10 @@ static inline Vec2d lookup(Vec2q const index, double const * table) { #endif // Limit index Vec2uq index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec2uq(index) & (n - 1); } diff --git a/vectorf256.h b/vectorf256.h index 6ce63fb..c49ab2d 100644 --- a/vectorf256.h +++ b/vectorf256.h @@ -1,8 +1,8 @@ /**************************** vectorf256.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 256-bit floating point vector classes @@ -18,7 +18,7 @@ * Each vector object is represented internally in the CPU as a 256-bit register. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. 
*****************************************************************************/ @@ -2843,7 +2843,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) { #endif // Limit index Vec8ui index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & (n-1); } @@ -2907,7 +2910,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) { #endif // Limit index Vec4uq index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4uq(index) & Vec4uq(n-1); } diff --git a/vectorf256e.h b/vectorf256e.h index 70376d4..a53f176 100644 --- a/vectorf256e.h +++ b/vectorf256e.h @@ -1,8 +1,8 @@ /**************************** vectorf256e.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 256-bit floating point vector classes @@ -19,7 +19,7 @@ * Each vector object is represented internally in the CPU as two 128-bit registers. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. 
*****************************************************************************/ @@ -1827,7 +1827,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) { } // Limit index Vec8ui index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & (n-1); } @@ -1856,7 +1859,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) { } // Limit index Vec8ui index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = Vec8ui(index); + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0); } diff --git a/vectorfp16.h b/vectorfp16.h index 8b07575..a64d48d 100644 --- a/vectorfp16.h +++ b/vectorfp16.h @@ -1,8 +1,8 @@ /**************************** vectorfp16.h ******************************* * Author: Agner Fog * Date created: 2022-05-03 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining half precision floating point vector classes @@ -23,7 +23,7 @@ * g++ version 12.1 with binutils version 2.34 * Intel c++ compiler version 2022.0 * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -687,24 +687,24 @@ static inline Vec8h change_sign(Vec8h const a) { // conversions Vec8h <-> Vec4f // extend precision: Vec8h -> Vec4f. upper half ignored -Vec4f convert8h_4f (Vec8h h) { +static inline Vec4f convert8h_4f (Vec8h h) { return _mm_cvtph_ps(_mm_castph_si128(h)); } // reduce precision: Vec4f -> Vec8h. 
upper half zero -Vec8h convert4f_8h (Vec4f f) { +static inline Vec8h convert4f_8h (Vec4f f) { return _mm_castsi128_ph(_mm_cvtps_ph(f, 0)); } #if MAX_VECTOR_SIZE >= 256 // conversions Vec8h <-> Vec8f // extend precision: Vec8h -> Vec8f -Vec8f to_float (Vec8h h) { +static inline Vec8f to_float (Vec8h h) { return _mm256_cvtph_ps(_mm_castph_si128(h)); } // reduce precision: Vec8f -> Vec8h -Vec8h to_float16 (Vec8f f) { +static inline Vec8h to_float16 (Vec8f f) { return _mm_castsi128_ph(_mm256_cvtps_ph(f, 0)); } #endif @@ -1308,7 +1308,7 @@ inline Vec16h pow(Vec16h const x0, uint32_t const n) { // implement as function pow(vector, const_int) template -static inline Vec16h pow(Vec16h const a, Const_int_t) { +Vec16h pow(Vec16h const a, Const_int_t) { return pow_n(a); } @@ -1422,7 +1422,7 @@ static inline Vec16h exp2(Vec16s const n) { // Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change template -static inline Vec16h change_sign(Vec16h const a) { +Vec16h change_sign(Vec16h const a) { if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) == 0) return a; __m256i mask = constant8ui< (i0 ? 0x8000 : 0) | (i1 ? 
0x80000000 : 0), @@ -1443,12 +1443,12 @@ static inline Vec16h change_sign(Vec16h const a) { *****************************************************************************/ #if MAX_VECTOR_SIZE >= 512 // extend precision: Vec8h -> Vec8f -Vec16f to_float (Vec16h h) { +static inline Vec16f to_float (Vec16h h) { return _mm512_cvtph_ps(_mm256_castph_si256(h)); } // reduce precision: Vec8f -> Vec8h -Vec16h to_float16 (Vec16f f) { +static inline Vec16h to_float16 (Vec16f f) { return _mm256_castsi256_ph(_mm512_cvtps_ph(f, 0)); } #endif @@ -1496,7 +1496,7 @@ static inline Vec16h extend_z(Vec8h a) { // permute vector Vec16h template -static inline Vec16h permute16(Vec16h const a) { +Vec16h permute16(Vec16h const a) { return _mm256_castsi256_ph ( permute16 ( Vec16s(_mm256_castph_si256(a)))); @@ -1512,7 +1512,7 @@ static inline Vec16h permute16(Vec16h const a) { // permute and blend Vec16h template -static inline Vec16h blend16(Vec16h const a, Vec16h const b) { +Vec16h blend16(Vec16h const a, Vec16h const b) { return _mm256_castsi256_ph ( blend16 ( Vec16s(_mm256_castph_si256(a)), Vec16s(_mm256_castph_si256(b)))); @@ -1535,7 +1535,7 @@ static inline Vec16h lookup16 (Vec16s const index, Vec16h const table) { } template -static inline Vec16h lookup(Vec16s const index, void const * table) { +Vec16h lookup(Vec16s const index, void const * table) { return _mm256_castsi256_ph(lookup(index, (void const *)(table))); } @@ -2063,7 +2063,7 @@ inline Vec32h pow(Vec32h const x0, uint32_t const n) { // implement as function pow(vector, const_int) template -static inline Vec32h pow(Vec32h const a, Const_int_t) { +Vec32h pow(Vec32h const a, Const_int_t) { return pow_n(a); } @@ -2178,7 +2178,7 @@ template -static inline Vec32h change_sign(Vec32h const a) { +Vec32h change_sign(Vec32h const a) { if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15 | i16 | i17 | i18 | i19 | i20 | i21 | i22 | i23 | i24 | i25 | i26 | i27 | i28 | i29 | i30 | i31) == 0) 
return a; @@ -2247,7 +2247,7 @@ template -static inline Vec32h permute32(Vec32h const a) { +Vec32h permute32(Vec32h const a) { return _mm512_castsi512_ph ( permute32 ( @@ -2266,7 +2266,7 @@ template -static inline Vec32h blend32(Vec32h const a, Vec32h const b) { +Vec32h blend32(Vec32h const a, Vec32h const b) { return _mm512_castsi512_ph ( blend32 ( @@ -2307,7 +2307,7 @@ static inline Vec32h lookup(Vec32s const index, void const * table) { // pow(2,n) template -static inline V vh_pow2n (V const n) { +V vh_pow2n (V const n) { typedef decltype(roundi(n)) VI; // corresponding integer vector type const _Float16 pow2_10 = 1024.; // 2^10 const _Float16 bias = 15.; // bias in exponent @@ -2355,7 +2355,7 @@ inline Vec32h infinite_vech() { // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x) template -static inline VTYPE exp_h(VTYPE const initial_x) { +VTYPE exp_h(VTYPE const initial_x) { // Taylor coefficients const _Float16 P0expf = 1.f/2.f; @@ -2444,7 +2444,7 @@ static inline Vec32us unsigned_int_type(Vec32h) { return 0; } // xx = input x (radians) // cosret = return pointer (only if SC = 3) template -static inline VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) { +VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) { // define constants const _Float16 dp1h = 1.57031250f; // pi/2 with lower bits of mantissa removed diff --git a/vectorfp16e.h b/vectorfp16e.h index 3de1411..f7a0b93 100644 --- a/vectorfp16e.h +++ b/vectorfp16e.h @@ -1,8 +1,8 @@ /**************************** vectorfp16e.h ******************************* * Author: Agner Fog * Date created: 2022-05-03 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file emulating half precision floating point vector classes @@ -17,7 +17,7 @@ * * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. 
* Apache License version 2.0 or later. *****************************************************************************/ @@ -350,19 +350,19 @@ class Vec8h { #ifdef __F16C__ // F16C instruction set has conversion instructions // extend precision: Vec8h -> Vec4f. upper half ignored -Vec4f convert8h_4f (Vec8h h) { +static inline Vec4f convert8h_4f (Vec8h h) { return _mm_cvtph_ps(h); } // reduce precision: Vec4f -> Vec8h. upper half zero -Vec8h convert4f_8h (Vec4f f) { +static inline Vec8h convert4f_8h (Vec4f f) { return _mm_cvtps_ph(f, 0); } #else // extend precision: Vec8h -> Vec4f. upper half ignored -Vec4f convert8h_4f (Vec8h x) { +static Vec4f convert8h_4f (Vec8h x) { // __m128i a = _mm_cvtepu16_epi32(x); // SSE4.1 __m128i a = _mm_unpacklo_epi16(x, _mm_setzero_si128 ()); // zero extend __m128i b = _mm_slli_epi32(a, 16); // left-justify @@ -387,7 +387,7 @@ Vec4f convert8h_4f (Vec8h x) { } // reduce precision: Vec4f -> Vec8h. upper half zero -Vec8h convert4f_8h (Vec4f x) { +static Vec8h convert4f_8h (Vec4f x) { __m128i a = _mm_castps_si128(x); // bit-cast to integer // 23 bit mantissa rounded to 10 bits - nearest or even __m128i r = _mm_srli_epi32(a, 12); // get first discarded mantissa bit @@ -437,12 +437,12 @@ Vec8h convert4f_8h (Vec4f x) { #if defined (__F16C__) && INSTRSET >= 8 // F16C instruction set has conversion instructions // extend precision: Vec8h -> Vec8f -Vec8f to_float (Vec8h h) { +static inline Vec8f to_float (Vec8h h) { return _mm256_cvtph_ps(h); } // reduce precision: Vec8f -> Vec8h -Vec8h to_float16 (Vec8f f) { +static inline Vec8h to_float16 (Vec8f f) { return _mm256_cvtps_ph(f, 0); } @@ -1115,7 +1115,7 @@ static inline Vec8h exp2(Vec8s const n) { // change signs on vectors Vec8h // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change template -static inline Vec8h change_sign(Vec8h const a) { +Vec8h change_sign(Vec8h const a) { if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a; __m128i mask = 
constant4ui< (i0 ? 0x8000 : 0) | (i1 ? 0x80000000 : 0), @@ -1155,7 +1155,7 @@ static inline __m128i reinterpret_h(__m128i const x) { *****************************************************************************/ // permute vector Vec8h template -static inline Vec8h permute8(Vec8h const a) { +Vec8h permute8(Vec8h const a) { return __m128i(permute8(Vec8s(__m128i(a)))); } @@ -1193,7 +1193,7 @@ static inline Vec8h lookup16(Vec8s const index, Vec8h const table0, Vec8h const } template -static inline Vec8h lookup(Vec8s const index, void const * table) { +Vec8h lookup(Vec8s const index, void const * table) { return __m128i(lookup(index, (void const *)(table))); } @@ -1927,7 +1927,7 @@ static inline Vec16h exp2(Vec16s const n) { // Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change template -static inline Vec16h change_sign(Vec16h const a) { +Vec16h change_sign(Vec16h const a) { #if INSTRSET >= 8 if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) == 0) return a; __m256i mask = constant8ui< @@ -1961,7 +1961,7 @@ static inline Vec16h change_sign(Vec16h const a) { // permute vector Vec16h template -static inline Vec16h permute16(Vec16h const a) { +Vec16h permute16(Vec16h const a) { return reinterpret_h ( permute16 ( Vec16s(reinterpret_i(a)))); diff --git a/vectori128.h b/vectori128.h index ac6aa0c..b19c4d7 100644 --- a/vectori128.h +++ b/vectori128.h @@ -1,8 +1,8 @@ /**************************** vectori128.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 128-bit integer vector classes @@ -27,7 +27,7 @@ * Each vector object is represented internally in the CPU as a 128-bit register. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. 
+* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -5547,7 +5547,10 @@ static inline Vec16c lookup(Vec16c const index, void const * table) { if constexpr (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16)); // n > 32. Limit index Vec16uc index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec16uc(index) & uint8_t(n - 1); } @@ -5596,7 +5599,10 @@ static inline Vec8s lookup(Vec8s const index, void const * table) { if constexpr (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8)); // n > 16. Limit index Vec8us index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8us(index) & (n - 1); } @@ -5682,7 +5688,10 @@ static inline Vec4i lookup(Vec4i const index, void const * table) { if constexpr (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4)); // n > 8. Limit index Vec4ui index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4ui(index) & (n - 1); } @@ -5714,7 +5723,10 @@ static inline Vec2q lookup(Vec2q const index, void const * table) { if constexpr (n <= 0) return 0; // n > 0. 
Limit index Vec2uq index1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec2uq(index) & (n - 1); } diff --git a/vectori256.h b/vectori256.h index 134cb21..2ffd61c 100644 --- a/vectori256.h +++ b/vectori256.h @@ -1,8 +1,8 @@ /**************************** vectori256.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining integer vector classes as interface to intrinsic @@ -28,7 +28,7 @@ * Each vector object is represented internally in the CPU as a 256-bit register. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -4678,7 +4678,10 @@ static inline Vec32c lookup(Vec32uc const index, void const * table) { if constexpr (n <= 32) return lookup32(index, Vec32c().load(table)); // n > 32. Limit index Vec32uc index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec32uc(index) & uint8_t(n-1); } @@ -4720,7 +4723,10 @@ static inline Vec16s lookup(Vec16s const index, void const * table) { if constexpr (n <= 16) return lookup16(index, Vec16s().load(table)); // n > 16. Limit index Vec16us index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec16us(index) & (n-1); } @@ -4754,7 +4760,10 @@ static inline Vec8i lookup(Vec8i const index, void const * table) { } // n > 16. 
Limit index Vec8ui index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & (n-1); } @@ -4774,7 +4783,10 @@ static inline Vec4q lookup(Vec4q const index, int64_t const * table) { if constexpr (n <= 0) return 0; // n > 0. Limit index Vec4uq index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = Vec4uq(index); + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4uq(index) & (n-1); } diff --git a/vectori256e.h b/vectori256e.h index 90a3ea1..4b9eff9 100644 --- a/vectori256e.h +++ b/vectori256e.h @@ -1,8 +1,8 @@ /**************************** vectori256e.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 256-bit integer point vector classes as interface @@ -28,7 +28,7 @@ * Each vector object is represented internally in the CPU as two 128-bit registers. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -3286,7 +3286,10 @@ static inline Vec32c lookup(Vec32uc const index, void const * table) { if constexpr (n <= 32) return lookup32(index, Vec32c().load(table)); // n > 32. 
Limit index Vec32uc index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec32uc(index) & uint8_t(n-1); } @@ -3325,7 +3328,10 @@ static inline Vec16s lookup(Vec16s const index, void const * table) { if constexpr (n <= 16) return lookup16(index, Vec16s().load(table)); // n > 16. Limit index Vec16us i1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + i1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec16us(index) & (n-1); } @@ -3358,7 +3364,10 @@ static inline Vec8i lookup(Vec8i const index, void const * table) { } // n > 8. Limit index Vec8ui i1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + i1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec8ui(index) & (n-1); } @@ -3379,7 +3388,10 @@ static inline Vec4q lookup(Vec4q const index, void const * table) { if constexpr (n <= 0) return 0; // n > 0. Limit index Vec4uq index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4uq(index) & (n-1); } diff --git a/vectori512.h b/vectori512.h index 6c6698e..97ec144 100644 --- a/vectori512.h +++ b/vectori512.h @@ -1,8 +1,8 @@ /**************************** vectori512.h ******************************* * Author: Agner Fog * Date created: 2014-07-23 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector class library * Description: * Header file defining 512-bit integer vector classes for 32 and 64 bit integers. @@ -22,7 +22,7 @@ * Each vector object is represented internally in the CPU as a 512-bit register. * This header file defines operators and functions for these vectors. 
* -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -1676,7 +1676,10 @@ static inline Vec16i lookup(Vec16i const index, void const * table) { } // n > 32. Limit index Vec16ui index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec16ui(index) & (n-1); } @@ -1707,7 +1710,10 @@ static inline Vec8q lookup(Vec8q const index, void const * table) { } // n > 16. Limit index Vec8uq index1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + index1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8uq(index) & (n-1); } diff --git a/vectori512e.h b/vectori512e.h index 7e3569a..1e04df1 100644 --- a/vectori512e.h +++ b/vectori512e.h @@ -1,8 +1,8 @@ /**************************** vectori512e.h ******************************* * Author: Agner Fog * Date created: 2014-07-23 -* Last modified: 2022-07-20 -* Version: 2.02.00 +* Last modified: 2023-06-03 +* Version: 2.02.01 * Project: vector classes * Description: * Header file defining 512-bit integer vector classes for 32 and 64 bit integers. @@ -21,7 +21,7 @@ * Each vector object is represented internally in the CPU as two 256-bit registers. * This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2022 Agner Fog. +* (c) Copyright 2012-2023 Agner Fog. * Apache License version 2.0 or later. *****************************************************************************/ @@ -1847,7 +1847,10 @@ static inline Vec16i lookup(Vec16i const index, void const * table) { if constexpr (n <= 16) return lookup16(index, Vec16i().load(table)); // n > 16. 
Limit index Vec16ui i1; - if constexpr ((n & (n - 1)) == 0) { + if constexpr (n == INT_MAX) { + i1 = index; + } + else if constexpr ((n & (n - 1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec16ui(index) & (n - 1); } @@ -1899,7 +1902,10 @@ static inline Vec8q lookup(Vec8q const index, void const * table) { } // n > 8. Limit index Vec8uq i1; - if constexpr ((n & (n-1)) == 0) { + if constexpr (n == INT_MAX) { + i1 = index; + } + else if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec8uq(index) & (n-1); }