From fe6c4506c6e8301d2edd884284d69663ec2b68a1 Mon Sep 17 00:00:00 2001
From: Agner Fog <AgnerF@users.noreply.github.com>
Date: Sat, 3 Jun 2023 11:11:13 +0200
Subject: [PATCH] Add files via upload

Version 2.02.01
---
 changelog.txt         |  2 ++
 dispatch_example2.cpp |  6 +++---
 instrset.h            |  7 ++++---
 vectorf128.h          | 16 +++++++++++-----
 vectorf256.h          | 16 +++++++++++-----
 vectorf256e.h         | 16 +++++++++++-----
 vectorfp16.h          | 42 +++++++++++++++++++++---------------------
 vectorfp16e.h         | 28 ++++++++++++++--------------
 vectori128.h          | 26 +++++++++++++++++++-------
 vectori256.h          | 26 +++++++++++++++++++-------
 vectori256e.h         | 26 +++++++++++++++++++-------
 vectori512.h          | 16 +++++++++++-----
 vectori512e.h         | 16 +++++++++++-----
 13 files changed, 156 insertions(+), 87 deletions(-)

diff --git a/changelog.txt b/changelog.txt
index 73c8d61..cd1a4ba 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,5 +1,7 @@
 Change log for Vector class library
 -----------------------------------
+2023-06-03 version 2.02.01
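+  * minor bug fixes and updates: lookup functions skip index limiting when n == INT_MAX; half precision conversion functions made static inline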
 
 2022-07-20 version 2.02.00
   * support half precision floating point vectors
diff --git a/dispatch_example2.cpp b/dispatch_example2.cpp
index 5360b41..c8e860e 100644
--- a/dispatch_example2.cpp
+++ b/dispatch_example2.cpp
@@ -1,7 +1,7 @@
 /*************************  dispatch_example2.cpp   ***************************
 Author:        Agner Fog
 Date created:  2012-05-30
-Last modified: 2020-02-25
+Last modified: 2023-06-03
 Version:       2.02.00
 Project:       vector class library
 Description:   Example of automatic CPU dispatching.
@@ -49,7 +49,7 @@ clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example2.cpp instrset_detect.cpp d7.
 # Run the program
 ./test.exe
 
-(c) Copyright 2012-2022 Agner Fog.
+(c) Copyright 2012-2023 Agner Fog.
 Apache License version 2.0 or later.
 ******************************************************************************/
 
@@ -168,7 +168,7 @@ float myfunc_dispatch(float const f[]) {
     // Choose which version of the entry function we want to point to:
     if      (iset >= 10) myfunc_pointer = &Ns_AVX512::myfunc;  // AVX512 version
     else if (iset >=  8) myfunc_pointer = &Ns_AVX2::myfunc;    // AVX2 version
-    else if (iset >=  5) myfunc_pointer = &Ns_AVX::myfunc;     // AVX version
+    else if (iset >=  7) myfunc_pointer = &Ns_AVX::myfunc;     // AVX version
     else if (iset >=  2) myfunc_pointer = &Ns_SSE2::myfunc;    // SSE2 version
     else {
         // Error: lowest instruction set not supported.
diff --git a/instrset.h b/instrset.h
index 5f646da..7f6fd3c 100644
--- a/instrset.h
+++ b/instrset.h
@@ -1,8 +1,8 @@
 /****************************  instrset.h   **********************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-26
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file for various compiler-specific tasks as well as common
@@ -16,7 +16,7 @@
 *
 * For instructions, see vcl_manual.pdf
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 ******************************************************************************/
 
@@ -110,6 +110,7 @@
 #endif
 
 #include <stdint.h>                    // Define integer types with known size
+#include <limits.h>                    // Define INT_MAX
 #include <stdlib.h>                    // define abs(int)
 
 
diff --git a/vectorf128.h b/vectorf128.h
index 5dfa96e..00c6586 100644
--- a/vectorf128.h
+++ b/vectorf128.h
@@ -1,8 +1,8 @@
 /****************************  vectorf128.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 128-bit floating point vector classes
@@ -18,7 +18,7 @@
 * Each vector object is represented internally in the CPU as a 128-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -2801,7 +2801,10 @@ static inline Vec4f lookup(Vec4i const index, float const * table) {
     }
     // n > 8. Limit index
     Vec4ui index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec4ui(index) & (n - 1);
     }
@@ -2853,7 +2856,10 @@ static inline Vec2d lookup(Vec2q const index, double const * table) {
 #endif
     // Limit index
     Vec2uq index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec2uq(index) & (n - 1);
     }
diff --git a/vectorf256.h b/vectorf256.h
index 6ce63fb..c49ab2d 100644
--- a/vectorf256.h
+++ b/vectorf256.h
@@ -1,8 +1,8 @@
 /****************************  vectorf256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 256-bit floating point vector classes
@@ -18,7 +18,7 @@
 * Each vector object is represented internally in the CPU as a 256-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -2843,7 +2843,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) {
 #endif
     // Limit index
     Vec8ui index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8ui(index) & (n-1);
     }
@@ -2907,7 +2910,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) {
 #endif
     // Limit index
     Vec4uq index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec4uq(index) & Vec4uq(n-1);
     }
diff --git a/vectorf256e.h b/vectorf256e.h
index 70376d4..a53f176 100644
--- a/vectorf256e.h
+++ b/vectorf256e.h
@@ -1,8 +1,8 @@
 /****************************  vectorf256e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 256-bit floating point vector classes
@@ -19,7 +19,7 @@
 * Each vector object is represented internally in the CPU as two 128-bit registers.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -1827,7 +1827,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) {
     }
     // Limit index
     Vec8ui index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8ui(index) & (n-1);
     }
@@ -1856,7 +1859,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) {
     }
     // Limit index
     Vec8ui index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = Vec8ui(index);
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8ui(index) & Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0);
     }
diff --git a/vectorfp16.h b/vectorfp16.h
index 8b07575..a64d48d 100644
--- a/vectorfp16.h
+++ b/vectorfp16.h
@@ -1,8 +1,8 @@
 /****************************  vectorfp16.h   *******************************
 * Author:        Agner Fog
 * Date created:  2022-05-03
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining half precision floating point vector classes
@@ -23,7 +23,7 @@
 * g++ version 12.1 with binutils version 2.34
 * Intel c++ compiler version 2022.0
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -687,24 +687,24 @@ static inline Vec8h change_sign(Vec8h const a) {
 
 // conversions Vec8h <-> Vec4f
 // extend precision: Vec8h -> Vec4f. upper half ignored
-Vec4f convert8h_4f (Vec8h h) {
+static inline Vec4f convert8h_4f (Vec8h h) {
     return _mm_cvtph_ps(_mm_castph_si128(h));
 }
 
 // reduce precision: Vec4f -> Vec8h. upper half zero
-Vec8h convert4f_8h (Vec4f f) {
+static inline Vec8h convert4f_8h (Vec4f f) {
     return _mm_castsi128_ph(_mm_cvtps_ph(f, 0));
 }
 
 #if MAX_VECTOR_SIZE >= 256
 // conversions Vec8h <-> Vec8f
 // extend precision: Vec8h -> Vec8f
-Vec8f to_float (Vec8h h) {
+static inline Vec8f to_float (Vec8h h) {
     return _mm256_cvtph_ps(_mm_castph_si128(h));
 }
 
 // reduce precision: Vec8f -> Vec8h
-Vec8h to_float16 (Vec8f f) {
+static inline Vec8h to_float16 (Vec8f f) {
     return _mm_castsi128_ph(_mm256_cvtps_ph(f, 0));
 } 
 #endif
@@ -1308,7 +1308,7 @@ inline Vec16h pow<uint32_t>(Vec16h const x0, uint32_t const n) {
 
 // implement as function pow(vector, const_int)
 template <int n>
-static inline Vec16h pow(Vec16h const a, Const_int_t<n>) {
+Vec16h pow(Vec16h const a, Const_int_t<n>) {
     return pow_n<Vec16h, n>(a);
 }
 
@@ -1422,7 +1422,7 @@ static inline Vec16h exp2(Vec16s const n) {
 // Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
-static inline Vec16h change_sign(Vec16h const a) {
+Vec16h change_sign(Vec16h const a) {
     if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) == 0) return a;
     __m256i mask = constant8ui<
         (i0  ? 0x8000 : 0) | (i1  ? 0x80000000 : 0), 
@@ -1443,12 +1443,12 @@ static inline Vec16h change_sign(Vec16h const a) {
 *****************************************************************************/
 #if MAX_VECTOR_SIZE >= 512
 // extend precision: Vec8h -> Vec8f
-Vec16f to_float (Vec16h h) {
+static inline Vec16f to_float (Vec16h h) {
     return _mm512_cvtph_ps(_mm256_castph_si256(h));
 }
 
 // reduce precision: Vec8f -> Vec8h
-Vec16h to_float16 (Vec16f f) {
+static inline Vec16h to_float16 (Vec16f f) {
     return _mm256_castsi256_ph(_mm512_cvtps_ph(f, 0));
 }
 #endif
@@ -1496,7 +1496,7 @@ static inline Vec16h extend_z(Vec8h a) {
 // permute vector Vec16h
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
-static inline Vec16h permute16(Vec16h const a) {
+Vec16h permute16(Vec16h const a) {
     return _mm256_castsi256_ph (
     permute16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (
     Vec16s(_mm256_castph_si256(a))));
@@ -1512,7 +1512,7 @@ static inline Vec16h permute16(Vec16h const a) {
 // permute and blend Vec16h
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
-static inline Vec16h blend16(Vec16h const a, Vec16h const b) {
+Vec16h blend16(Vec16h const a, Vec16h const b) {
     return _mm256_castsi256_ph (
     blend16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (
     Vec16s(_mm256_castph_si256(a)), Vec16s(_mm256_castph_si256(b))));
@@ -1535,7 +1535,7 @@ static inline Vec16h lookup16 (Vec16s const index, Vec16h const table) {
 }
 
 template <int n>
-static inline Vec16h lookup(Vec16s const index, void const * table) {
+Vec16h lookup(Vec16s const index, void const * table) {
     return _mm256_castsi256_ph(lookup<n>(index, (void const *)(table)));
 }
 
@@ -2063,7 +2063,7 @@ inline Vec32h pow<uint32_t>(Vec32h const x0, uint32_t const n) {
 
 // implement as function pow(vector, const_int)
 template <int n>
-static inline Vec32h pow(Vec32h const a, Const_int_t<n>) {
+Vec32h pow(Vec32h const a, Const_int_t<n>) {
     return pow_n<Vec32h, n>(a);
 }
 
@@ -2178,7 +2178,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
 int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
 int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
-static inline Vec32h change_sign(Vec32h const a) {
+Vec32h change_sign(Vec32h const a) {
     if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15 |
     i16 | i17 | i18 | i19 | i20 | i21 | i22 | i23 | i24 | i25 | i26 | i27 | i28 | i29 | i30 | i31)
     == 0) return a;
@@ -2247,7 +2247,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
 int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
 int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
-static inline Vec32h permute32(Vec32h const a) {
+Vec32h permute32(Vec32h const a) {
     return _mm512_castsi512_ph (
     permute32<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
     i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 > (
@@ -2266,7 +2266,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
 int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
 int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
-static inline Vec32h blend32(Vec32h const a, Vec32h const b) {
+Vec32h blend32(Vec32h const a, Vec32h const b) {
     return _mm512_castsi512_ph (
     blend32<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
     i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 > (
@@ -2307,7 +2307,7 @@ static inline Vec32h lookup(Vec32s const index, void const * table) {
 
 // pow(2,n)
 template <typename V>
-static inline V vh_pow2n (V const n) {           
+V vh_pow2n (V const n) {           
     typedef decltype(roundi(n)) VI;              // corresponding integer vector type
     const _Float16 pow2_10 =  1024.;             // 2^10
     const _Float16 bias = 15.;                   // bias in exponent
@@ -2355,7 +2355,7 @@ inline Vec32h infinite_vech<Vec32h>() {
 // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
 
 template<typename VTYPE, int M1, int BA>
-static inline VTYPE exp_h(VTYPE const initial_x) {
+VTYPE exp_h(VTYPE const initial_x) {
 
     // Taylor coefficients
     const _Float16 P0expf   =  1.f/2.f;
@@ -2444,7 +2444,7 @@ static inline Vec32us unsigned_int_type(Vec32h) { return 0; }
 // xx = input x (radians)
 // cosret = return pointer (only if SC = 3)
 template<typename VTYPE, int SC>
-static inline VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) {
+VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) {
 
     // define constants
     const _Float16 dp1h = 1.57031250f;           // pi/2 with lower bits of mantissa removed
diff --git a/vectorfp16e.h b/vectorfp16e.h
index 3de1411..f7a0b93 100644
--- a/vectorfp16e.h
+++ b/vectorfp16e.h
@@ -1,8 +1,8 @@
 /****************************  vectorfp16e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2022-05-03
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file emulating half precision floating point vector classes
@@ -17,7 +17,7 @@
 *
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -350,19 +350,19 @@ class Vec8h {
 #ifdef __F16C__    // F16C instruction set has conversion instructions
 
 // extend precision: Vec8h -> Vec4f. upper half ignored
-Vec4f convert8h_4f (Vec8h h) {
+static inline Vec4f convert8h_4f (Vec8h h) {
     return _mm_cvtph_ps(h);
 }
 
 // reduce precision: Vec4f -> Vec8h. upper half zero
-Vec8h convert4f_8h (Vec4f f) {
+static inline Vec8h convert4f_8h (Vec4f f) {
     return _mm_cvtps_ph(f, 0);
 }
 
 #else
 
 // extend precision: Vec8h -> Vec4f. upper half ignored
-Vec4f convert8h_4f (Vec8h x) {
+static Vec4f convert8h_4f (Vec8h x) {
     // __m128i a = _mm_cvtepu16_epi32(x);                            // SSE4.1
     __m128i a = _mm_unpacklo_epi16(x, _mm_setzero_si128 ());         // zero extend
     __m128i b = _mm_slli_epi32(a, 16);                               // left-justify
@@ -387,7 +387,7 @@ Vec4f convert8h_4f (Vec8h x) {
 }
 
 // reduce precision: Vec4f -> Vec8h. upper half zero
-Vec8h convert4f_8h (Vec4f x) {
+static Vec8h convert4f_8h (Vec4f x) {
     __m128i a = _mm_castps_si128(x);                                 // bit-cast to integer
     // 23 bit mantissa rounded to 10 bits - nearest or even
     __m128i r = _mm_srli_epi32(a, 12);                               // get first discarded mantissa bit
@@ -437,12 +437,12 @@ Vec8h convert4f_8h (Vec4f x) {
 #if defined (__F16C__) && INSTRSET >= 8  // F16C instruction set has conversion instructions
 
 // extend precision: Vec8h -> Vec8f
-Vec8f to_float (Vec8h h) {
+static inline Vec8f to_float (Vec8h h) {
     return _mm256_cvtph_ps(h);
 }
 
 // reduce precision: Vec8f -> Vec8h
-Vec8h to_float16 (Vec8f f) {
+static inline Vec8h to_float16 (Vec8f f) {
     return _mm256_cvtps_ph(f, 0);
 }
 
@@ -1115,7 +1115,7 @@ static inline Vec8h exp2(Vec8s const n) {
 // change signs on vectors Vec8h
 // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8h change_sign(Vec8h const a) {
+Vec8h change_sign(Vec8h const a) {
     if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;
     __m128i mask = constant4ui<
         (i0 ? 0x8000 : 0) | (i1 ? 0x80000000 : 0), 
@@ -1155,7 +1155,7 @@ static inline __m128i reinterpret_h(__m128i const x) {
 *****************************************************************************/
 // permute vector Vec8h
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8h permute8(Vec8h const a) {
+Vec8h permute8(Vec8h const a) {
     return __m128i(permute8<i0, i1, i2, i3, i4, i5, i6, i7>(Vec8s(__m128i(a))));
 }
 
@@ -1193,7 +1193,7 @@ static inline Vec8h lookup16(Vec8s const index, Vec8h const table0, Vec8h const
 }
 
 template <int n>
-static inline Vec8h lookup(Vec8s const index, void const * table) {
+Vec8h lookup(Vec8s const index, void const * table) {
     return __m128i(lookup<n>(index, (void const *)(table)));
 }
 
@@ -1927,7 +1927,7 @@ static inline Vec16h exp2(Vec16s const n) {
 // Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
-static inline Vec16h change_sign(Vec16h const a) {
+Vec16h change_sign(Vec16h const a) {
 #if INSTRSET >= 8
     if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) == 0) return a;
     __m256i mask = constant8ui<
@@ -1961,7 +1961,7 @@ static inline Vec16h change_sign(Vec16h const a) {
 // permute vector Vec16h
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
-static inline Vec16h permute16(Vec16h const a) {
+Vec16h permute16(Vec16h const a) {
     return reinterpret_h (
     permute16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (
     Vec16s(reinterpret_i(a))));
diff --git a/vectori128.h b/vectori128.h
index ac6aa0c..b19c4d7 100644
--- a/vectori128.h
+++ b/vectori128.h
@@ -1,8 +1,8 @@
 /****************************  vectori128.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 128-bit integer vector classes
@@ -27,7 +27,7 @@
 * Each vector object is represented internally in the CPU as a 128-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -5547,7 +5547,10 @@ static inline Vec16c lookup(Vec16c const index, void const * table) {
     if constexpr (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16));
     // n > 32. Limit index
     Vec16uc index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec16uc(index) & uint8_t(n - 1);
     }
@@ -5596,7 +5599,10 @@ static inline Vec8s lookup(Vec8s const index, void const * table) {
     if constexpr (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8));
     // n > 16. Limit index
     Vec8us index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8us(index) & (n - 1);
     }
@@ -5682,7 +5688,10 @@ static inline Vec4i lookup(Vec4i const index, void const * table) {
     if constexpr (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4));
     // n > 8. Limit index
     Vec4ui index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec4ui(index) & (n - 1);
     }
@@ -5714,7 +5723,10 @@ static inline Vec2q lookup(Vec2q const index, void const * table) {
     if constexpr (n <= 0) return 0;
     // n > 0. Limit index
     Vec2uq index1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec2uq(index) & (n - 1);
     }
diff --git a/vectori256.h b/vectori256.h
index 134cb21..2ffd61c 100644
--- a/vectori256.h
+++ b/vectori256.h
@@ -1,8 +1,8 @@
 /****************************  vectori256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining integer vector classes as interface to intrinsic
@@ -28,7 +28,7 @@
 * Each vector object is represented internally in the CPU as a 256-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -4678,7 +4678,10 @@ static inline Vec32c lookup(Vec32uc const index, void const * table) {
     if constexpr (n <= 32) return lookup32(index, Vec32c().load(table));
     // n > 32. Limit index
     Vec32uc index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec32uc(index) & uint8_t(n-1);
     }
@@ -4720,7 +4723,10 @@ static inline Vec16s lookup(Vec16s const index, void const * table) {
     if constexpr (n <= 16) return lookup16(index, Vec16s().load(table));
     // n > 16. Limit index
     Vec16us index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec16us(index) & (n-1);
     }
@@ -4754,7 +4760,10 @@ static inline Vec8i lookup(Vec8i const index, void const * table) {
     }
     // n > 16. Limit index
     Vec8ui index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8ui(index) & (n-1);
     }
@@ -4774,7 +4783,10 @@ static inline Vec4q lookup(Vec4q const index, int64_t const * table) {
     if constexpr (n <= 0) return 0;
     // n > 0. Limit index
     Vec4uq index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = Vec4uq(index);
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec4uq(index) & (n-1);
     }
diff --git a/vectori256e.h b/vectori256e.h
index 90a3ea1..4b9eff9 100644
--- a/vectori256e.h
+++ b/vectori256e.h
@@ -1,8 +1,8 @@
 /****************************  vectori256e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 256-bit integer point vector classes as interface
@@ -28,7 +28,7 @@
 * Each vector object is represented internally in the CPU as two 128-bit registers.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -3286,7 +3286,10 @@ static inline Vec32c lookup(Vec32uc const index, void const * table) {
     if constexpr (n <= 32) return lookup32(index, Vec32c().load(table));
     // n > 32. Limit index
     Vec32uc index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec32uc(index) & uint8_t(n-1);
     }
@@ -3325,7 +3328,10 @@ static inline Vec16s lookup(Vec16s const index, void const * table) {
     if constexpr (n <= 16) return lookup16(index, Vec16s().load(table));
     // n > 16. Limit index
     Vec16us i1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        i1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         i1 = Vec16us(index) & (n-1);
     }
@@ -3358,7 +3364,10 @@ static inline Vec8i lookup(Vec8i const index, void const * table) {
     }
     // n > 8. Limit index
     Vec8ui i1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        i1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         i1 = Vec8ui(index) & (n-1);
     }
@@ -3379,7 +3388,10 @@ static inline Vec4q lookup(Vec4q const index, void const * table) {
     if constexpr (n <= 0) return 0;
     // n > 0. Limit index
     Vec4uq index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec4uq(index) & (n-1);
     }
diff --git a/vectori512.h b/vectori512.h
index 6c6698e..97ec144 100644
--- a/vectori512.h
+++ b/vectori512.h
@@ -1,8 +1,8 @@
 /****************************  vectori512.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector class library
 * Description:
 * Header file defining 512-bit integer vector classes for 32 and 64 bit integers.
@@ -22,7 +22,7 @@
 * Each vector object is represented internally in the CPU as a 512-bit register.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -1676,7 +1676,10 @@ static inline Vec16i lookup(Vec16i const index, void const * table) {
     }
     // n > 32. Limit index
     Vec16ui index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec16ui(index) & (n-1);
     }
@@ -1707,7 +1710,10 @@ static inline Vec8q lookup(Vec8q const index, void const * table) {
     }
     // n > 16. Limit index
     Vec8uq index1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        index1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         index1 = Vec8uq(index) & (n-1);
     }
diff --git a/vectori512e.h b/vectori512e.h
index 7e3569a..1e04df1 100644
--- a/vectori512e.h
+++ b/vectori512e.h
@@ -1,8 +1,8 @@
 /****************************  vectori512e.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2022-07-20
-* Version:       2.02.00
+* Last modified: 2023-06-03
+* Version:       2.02.01
 * Project:       vector classes
 * Description:
 * Header file defining 512-bit integer vector classes for 32 and 64 bit integers.
@@ -21,7 +21,7 @@
 * Each vector object is represented internally in the CPU as two 256-bit registers.
 * This header file defines operators and functions for these vectors.
 *
-* (c) Copyright 2012-2022 Agner Fog.
+* (c) Copyright 2012-2023 Agner Fog.
 * Apache License version 2.0 or later.
 *****************************************************************************/
 
@@ -1847,7 +1847,10 @@ static inline Vec16i lookup(Vec16i const index, void const * table) {
     if constexpr (n <= 16) return lookup16(index, Vec16i().load(table));
     // n > 16. Limit index
     Vec16ui i1;
-    if constexpr ((n & (n - 1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        i1 = index;
+    }
+    else if constexpr ((n & (n - 1)) == 0) {
         // n is a power of 2, make index modulo n
         i1 = Vec16ui(index) & (n - 1);
     }
@@ -1899,7 +1902,10 @@ static inline Vec8q lookup(Vec8q const index, void const * table) {
     }
     // n > 8. Limit index
     Vec8uq i1;
-    if constexpr ((n & (n-1)) == 0) {
+    if constexpr (n == INT_MAX) {
+        i1 = index;
+    }
+    else if constexpr ((n & (n-1)) == 0) {
         // n is a power of 2, make index modulo n
         i1 = Vec8uq(index) & (n-1);
     }