diff --git a/scripts/get_native_properties.sh b/scripts/get_native_properties.sh
deleted file mode 100755
index fb124021a31..00000000000
--- a/scripts/get_native_properties.sh
+++ /dev/null
@@ -1,120 +0,0 @@
-#!/bin/sh
-
-#
-# Returns properties of the native system.
-# best architecture as supported by the CPU
-# filename of the best binary uploaded as an artifact during CI
-#
-
-# Check if all the given flags are present in the CPU flags list
-check_flags() {
- for flag; do
- printf '%s\n' "$flags" | grep -q -w "$flag" || return 1
- done
-}
-
-# Set the CPU flags list
-# remove underscores and points from flags, e.g. gcc uses avx512vnni, while some cpuinfo can have avx512_vnni, some systems use sse4_1 others sse4.1
-get_flags() {
- flags=$(awk '/^flags[ \t]*:|^Features[ \t]*:/{gsub(/^flags[ \t]*:[ \t]*|^Features[ \t]*:[ \t]*|[_.]/, ""); line=$0} END{print line}' /proc/cpuinfo)
-}
-
-# Check for gcc march "znver1" or "znver2" https://en.wikichip.org/wiki/amd/cpuid
-check_znver_1_2() {
- vendor_id=$(awk '/^vendor_id/{print $3; exit}' /proc/cpuinfo)
- cpu_family=$(awk '/^cpu family/{print $4; exit}' /proc/cpuinfo)
- [ "$vendor_id" = "AuthenticAMD" ] && [ "$cpu_family" = "23" ] && znver_1_2=true
-}
-
-# Set the file CPU x86_64 architecture
-set_arch_x86_64() {
- if check_flags 'avx512vnni' 'avx512dq' 'avx512f' 'avx512bw' 'avx512vl'; then
- true_arch='x86-64-vnni256'
- elif check_flags 'avx512f' 'avx512bw'; then
- true_arch='x86-64-avx512'
- elif [ -z "${znver_1_2+1}" ] && check_flags 'bmi2'; then
- true_arch='x86-64-bmi2'
- elif check_flags 'avx2'; then
- true_arch='x86-64-avx2'
- elif check_flags 'sse41' && check_flags 'popcnt'; then
- true_arch='x86-64-sse41-popcnt'
- else
- true_arch='x86-64'
- fi
-}
-
-# Check the system type
-uname_s=$(uname -s)
-uname_m=$(uname -m)
-case $uname_s in
- 'Darwin') # Mac OSX system
- case $uname_m in
- 'arm64')
- true_arch='apple-silicon'
- file_arch='x86-64-sse41-popcnt' # Supported by Rosetta 2
- ;;
- 'x86_64')
- flags=$(sysctl -n machdep.cpu.features machdep.cpu.leaf7_features | tr '\n' ' ' | tr '[:upper:]' '[:lower:]' | tr -d '_.')
- set_arch_x86_64
- if [ "$true_arch" = 'x86-64-vnni256' ] || [ "$true_arch" = 'x86-64-avx512' ]; then
- file_arch='x86-64-bmi2'
- fi
- ;;
- esac
- file_os='macos'
- file_ext='tar'
- ;;
- 'Linux') # Linux system
- get_flags
- case $uname_m in
- 'x86_64')
- file_os='ubuntu'
- check_znver_1_2
- set_arch_x86_64
- ;;
- 'i686')
- file_os='ubuntu'
- true_arch='x86-32'
- ;;
- 'aarch64')
- file_os='android'
- true_arch='armv8'
- if check_flags 'asimddp'; then
- true_arch="$true_arch-dotprod"
- fi
- ;;
- 'armv7'*)
- file_os='android'
- true_arch='armv7'
- if check_flags 'neon'; then
- true_arch="$true_arch-neon"
- fi
- ;;
- *) # Unsupported machine type, exit with error
- printf 'Unsupported machine type: %s\n' "$uname_m"
- exit 1
- ;;
- esac
- file_ext='tar'
- ;;
- 'CYGWIN'*|'MINGW'*|'MSYS'*) # Windows system with POSIX compatibility layer
- get_flags
- check_znver_1_2
- set_arch_x86_64
- file_os='windows'
- file_ext='zip'
- ;;
- *)
- # Unknown system type, exit with error
- printf 'Unsupported system type: %s\n' "$uname_s"
- exit 1
- ;;
-esac
-
-if [ -z "$file_arch" ]; then
- file_arch=$true_arch
-fi
-
-file_name="stockfish-$file_os-$file_arch.$file_ext"
-
-printf '%s %s\n' "$true_arch" "$file_name"
diff --git a/src/Makefile b/src/Makefile
index 042d9479cc8..87e93cb581b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,1035 +14,400 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
+default: help
-### ==========================================================================
-### Section 1. General Configuration
-### ==========================================================================
+.PHONY: default help strip install clean objclean profileclean net format \
+ all config-sanity analyze build profile-build \
+ gcc-profile-make gcc-profile-use \
+ clang-profile-make clang-profile-use \
+ icx-profile-make icx-profile-use
-### Establish the operating system name
-KERNEL := $(shell uname -s)
-ifeq ($(KERNEL),Linux)
- OS := $(shell uname -o)
-endif
+VPATH = syzygy:nnue:nnue/features
-### Target Windows OS
-ifeq ($(OS),Windows_NT)
- ifneq ($(COMP),ndk)
- target_windows = yes
- endif
-else ifeq ($(COMP),mingw)
- target_windows = yes
- ifeq ($(WINE_PATH),)
- WINE_PATH := $(shell which wine)
- endif
-endif
+SRCS := $(shell find . -name "*.cpp" ! -path "./incbin/*")
+OBJS := $(notdir $(SRCS:.cpp=.o))
+HEADERS := $(shell find . -name "*.h" ! -path "./incbin/*")
+
+INSTALL_PREFIX := /usr/local
+INSTALL_PATH := $(INSTALL_PREFIX)/bin
-### Executable name
-ifeq ($(target_windows),yes)
- EXE = stockfish.exe
+ifeq ($(OS),Windows_NT)
+ INSTALL_EXE := stockfish.exe
else
- EXE = stockfish
+ INSTALL_EXE := stockfish
endif
-### Installation dir definitions
-PREFIX = /usr/local
-BINDIR = $(PREFIX)/bin
-
-### Built-in benchmark for pgo-builds
-PGOBENCH = $(WINE_PATH) ./$(EXE) bench
+KERNEL := $(shell uname -s)
-### Source and object files
-SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
- misc.cpp movegen.cpp movepick.cpp position.cpp \
- search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
- nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp
+strip:
+ -@test -f stockfish && strip stockfish
+ -@test -f stockfish.exe && strip stockfish.exe
-HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
- nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
- nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h nnue/layers/simd.h \
- nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
- nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
- search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
- tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h
+install:
+ mkdir -p -m 755 $(INSTALL_PATH)
+ cp $(INSTALL_EXE) $(INSTALL_PATH)
+ strip $(INSTALL_PATH)/$(INSTALL_EXE)
-OBJS = $(notdir $(SRCS:.cpp=.o))
+clean: objclean profileclean
+ @rm -f .depend
-VPATH = syzygy:nnue:nnue/features
+objclean:
+ @rm -f stockfish stockfish.exe $(OBJS) *.o.tmp
-### ==========================================================================
-### Section 2. High-level Configuration
-### ==========================================================================
-#
-# flag --- Comp switch --- Description
-# ----------------------------------------------------------------------------
-#
-# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode
-# sanitize = none/ ... (-fsanitize )
-# --- ( undefined ) --- enable undefined behavior checks
-# --- ( thread ) --- enable threading error checks
-# --- ( address ) --- enable memory access checks
-# --- ...etc... --- see compiler documentation for supported sanitizers
-# optimize = yes/no --- (-O3/-fast etc.) --- Enable/Disable optimizations
-# arch = (name) --- (-arch) --- Target architecture
-# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system
-# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction
-# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction
-# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction
-# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions
-# mmx = yes/no --- -mmmx --- Use Intel MMX instructions
-# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2
-# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3
-# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1
-# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2
-# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX
-# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512
-# vnni256 = yes/no --- -mavx256vnni --- Use Intel Vector Neural Network Instructions 512 with 256bit operands
-# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
-# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
-# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
-#
-# Note that Makefile is space sensitive, so when adding new architectures
-# or modifying existing flags, you have to make sure there are no extra spaces
-# at the end of the line for flag values.
-#
-# Example of use for these flags:
-# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined"
+profileclean:
+ @rm -f PGOBENCH.out
+ @rm -rf profdir
+ @rm -f stockfish.profdata *.profraw
+net:
+ @$(SHELL) ../scripts/net.sh
-### 2.1. General and architecture defaults
+format: CLANG_FORMAT := $(shell command -v clang-format-18 2> /dev/null || \
+ command -v clang-format 2> /dev/null)
+format:
+ @test -n "$(CLANG_FORMAT)" || ( \
+ echo "clang-format not found. Please install clang-format-18."; false \
+ )
+ @$(CLANG_FORMAT) -i $(SRCS) $(HEADERS) -style=file
-ifeq ($(ARCH),)
- ARCH = native
-endif
+### ==========================================================================
-ifeq ($(ARCH), native)
- override ARCH := $(shell $(SHELL) ../scripts/get_native_properties.sh | cut -d " " -f 1)
-endif
+CXX_REQUIRED_RULES := analyze config-sanity build profile-build all \
+ gcc-profile-make gcc-profile-use \
+ clang-profile-make clang-profile-use \
+ icx-profile-make icx-profile-use
-# explicitly check for the list of supported architectures (as listed with make help),
-# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true`
-ifeq ($(ARCH), $(filter $(ARCH), \
- x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
- x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
- x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
- armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 loongarch64))
- SUPPORTED_ARCH=true
-else
- SUPPORTED_ARCH=false
-endif
+ifeq ($(MAKELEVEL),0)
+ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)
optimize = yes
debug = no
sanitize = none
-bits = 64
-prefetch = no
-popcnt = no
-pext = no
-sse = no
-mmx = no
-sse2 = no
-ssse3 = no
-sse41 = no
-avx2 = no
-avxvnni = no
-avx512 = no
-vnni256 = no
-vnni512 = no
-neon = no
-dotprod = no
-arm_version = 0
-STRIP = strip
-
-ifneq ($(shell which clang-format-18 2> /dev/null),)
- CLANG-FORMAT = clang-format-18
-else
- CLANG-FORMAT = clang-format
-endif
-
-### 2.2 Architecture specific
-
-ifeq ($(findstring x86,$(ARCH)),x86)
-# x86-32/64
-
-ifeq ($(findstring x86-32,$(ARCH)),x86-32)
- arch = i386
- bits = 32
- sse = no
- mmx = yes
+ifeq ($(shell command -v $(CXX) 2> /dev/null),)
+ $(error Compiler $(CXX) not found)
+endif
+
+define test-compiler-macro
+$(shell echo | $(CXX) -dM -x c++ -E - | \
+ grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" > /dev/null 2>&1 && echo 1)
+endef
+
+define get-compiler-macro
+$(shell echo | $(CXX) -dM -x c++ -E - | \
+ grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" | \
+ sed "s/^#define[[:space:]]\+$(1)[[:space:]]\+//")
+endef
+
+### 1. Detect compiler type
+
+ifeq ($(call test-compiler-macro,__GNUC__),1)
+ ifeq ($(call test-compiler-macro,__INTEL_LLVM_COMPILER),1)
+ $(info Using Intel oneAPI DPC++/C++ Compiler) $(info )
+ COMP := icx
+ profile_make = icx-profile-make
+ profile_use = icx-profile-use
+ else ifeq ($(call test-compiler-macro,__clang__),1)
+ $(info Using LLVM C/C++ Compiler (Clang)) $(info )
+ COMP := clang
+ CLANG_VERSION := $(call get-compiler-macro,__clang_major__)
+ LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \
+ command -v llvm-profdata 2> /dev/null)
+ profile_make = clang-profile-make
+ profile_use = clang-profile-use
+ export LLVM_PROFDATA
+ else
+ $(info Using GNU C/C++ Compiler) $(info )
+ COMP := gcc
+ GCC_VERSION := $(call get-compiler-macro,__GNUC__)
+ profile_make = gcc-profile-make
+ profile_use = gcc-profile-use
+ endif
+endif
+
+ifneq ($(filter $(COMP),gcc clang),)
+ MINGW := $(call test-compiler-macro,__MINGW32__)
+endif
+
+ifeq ($(MINGW),1)
+ EXE = stockfish.exe
else
- arch = x86_64
- sse = yes
- sse2 = yes
-endif
-
-ifeq ($(findstring -sse,$(ARCH)),-sse)
- sse = yes
-endif
-
-ifeq ($(findstring -popcnt,$(ARCH)),-popcnt)
- popcnt = yes
+ EXE = stockfish
endif
-ifeq ($(findstring -mmx,$(ARCH)),-mmx)
- mmx = yes
-endif
-
-ifeq ($(findstring -sse2,$(ARCH)),-sse2)
- sse = yes
- sse2 = yes
-endif
+export COMP MINGW EXE
-ifeq ($(findstring -ssse3,$(ARCH)),-ssse3)
- sse = yes
- sse2 = yes
- ssse3 = yes
-endif
-
-ifeq ($(findstring -sse41,$(ARCH)),-sse41)
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
-endif
+### 2. Set compiler options
-ifeq ($(findstring -modern,$(ARCH)),-modern)
- $(warning *** ARCH=$(ARCH) is deprecated, defaulting to ARCH=x86-64-sse41-popcnt. Execute `make help` for a list of available architectures. ***)
- $(shell sleep 5)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
-endif
-
-ifeq ($(findstring -avx2,$(ARCH)),-avx2)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
-endif
+# GNU C Compiler
+# https://gcc.gnu.org/onlinedocs/gcc/Option-Index.html
+#
+# Clang Compiler
+# https://clang.llvm.org/docs/ClangCommandLineReference.html
+# https://clang.llvm.org/docs/DiagnosticsReference.html
+#
+# Intel oneAPI DPC++/C++ Compiler
+# https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2024-2/alphabetical-option-list.html
-ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
- avxvnni = yes
- pext = yes
-endif
+### 2.1. Common options
-ifeq ($(findstring -bmi2,$(ARCH)),-bmi2)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
- pext = yes
-endif
+SF_CXXFLAGS := -std=c++17 -I. -Wall -DUSE_PTHREADS
+SF_LDFLAGS :=
-ifeq ($(findstring -avx512,$(ARCH)),-avx512)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
- pext = yes
- avx512 = yes
-endif
+SF_LIBS := pthread
-ifeq ($(findstring -vnni256,$(ARCH)),-vnni256)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
- pext = yes
- vnni256 = yes
-endif
+### 2.2. Compiler-specific options
-ifeq ($(findstring -vnni512,$(ARCH)),-vnni512)
- popcnt = yes
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- avx2 = yes
- pext = yes
- avx512 = yes
- vnni512 = yes
+ifeq ($(COMP),gcc)
+ SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wmissing-declarations \
+ -Wshadow
+else ifeq ($(COMP),clang)
+ SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wconditional-uninitialized \
+ -Wmissing-prototypes -Wshadow
+else ifeq ($(COMP),icx)
+ SF_CXXFLAGS += -Wabi -Wmissing-declarations -Wmissing-prototypes -Wshadow
endif
-ifeq ($(sse),yes)
- prefetch = yes
-endif
+### 2.3. Optimization options
-# 64-bit pext is not available on x86-32
-ifeq ($(bits),32)
- pext = no
-endif
+ifeq ($(optimize),yes)
+ SF_CXXFLAGS += -O3
+
+ ifeq ($(COMP),gcc)
+ SF_CXXFLAGS += -funroll-loops
+ ifeq ($(shell expr $(GCC_VERSION) \< 12),1)
+ SF_CXXFLAGS += -flto
+ SF_LDFLAGS += -flto
+ else
+ SF_CXXFLAGS += -flto=jobserver
+ SF_LDFLAGS += -flto=jobserver
+ endif
+ SF_CXXFLAGS += -flto-partition=one
+ SF_LDFLAGS += -flto-partition=one
+ else ifeq ($(COMP),clang)
+ SF_CXXFLAGS += -funroll-loops -flto=full
+ SF_LDFLAGS += -flto=full
+ ifeq ($(shell expr $(CLANG_VERSION) \< 16),1)
+ SF_CXXFLAGS += -fexperimental-new-pass-manager
+ endif
+ else ifeq ($(COMP),icx)
+ SF_CXXFLAGS += -flto=full
+ SF_LDFLAGS += -flto=full
+ endif
+endif
+
+### 2.4. Debug options
+ifeq ($(debug),no)
+ SF_CXXFLAGS += -DNDEBUG
else
-
-# all other architectures
-
-ifeq ($(ARCH),general-32)
- arch = any
- bits = 32
-endif
-
-ifeq ($(ARCH),general-64)
- arch = any
-endif
-
-ifeq ($(ARCH),armv7)
- arch = armv7
- prefetch = yes
- bits = 32
- arm_version = 7
-endif
-
-ifeq ($(ARCH),armv7-neon)
- arch = armv7
- prefetch = yes
- popcnt = yes
- neon = yes
- bits = 32
- arm_version = 7
-endif
-
-ifeq ($(ARCH),armv8)
- arch = armv8
- prefetch = yes
- popcnt = yes
- neon = yes
- arm_version = 8
-endif
-
-ifeq ($(ARCH),armv8-dotprod)
- arch = armv8
- prefetch = yes
- popcnt = yes
- neon = yes
- dotprod = yes
- arm_version = 8
-endif
-
-ifeq ($(ARCH),apple-silicon)
- arch = arm64
- prefetch = yes
- popcnt = yes
- neon = yes
- dotprod = yes
- arm_version = 8
-endif
-
-ifeq ($(ARCH),ppc-32)
- arch = ppc
- bits = 32
-endif
-
-ifeq ($(ARCH),ppc-64)
- arch = ppc64
- popcnt = yes
- prefetch = yes
-endif
-
-ifeq ($(findstring e2k,$(ARCH)),e2k)
- arch = e2k
- mmx = yes
- bits = 64
- sse = yes
- sse2 = yes
- ssse3 = yes
- sse41 = yes
- popcnt = yes
-endif
-
-ifeq ($(ARCH),riscv64)
- arch = riscv64
-endif
-
-ifeq ($(ARCH),loongarch64)
- arch = loongarch64
+ SF_CXXFLAGS += -g
endif
-endif
-
-### ==========================================================================
-### Section 3. Low-level Configuration
-### ==========================================================================
+### 2.5. Sanitizer options
-### 3.1 Selecting compiler (default = gcc)
-ifeq ($(MAKELEVEL),0)
- export ENV_CXXFLAGS := $(CXXFLAGS)
- export ENV_DEPENDFLAGS := $(DEPENDFLAGS)
- export ENV_LDFLAGS := $(LDFLAGS)
+ifneq ($(sanitize),none)
+ SF_CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize))
endif
-CXXFLAGS = $(ENV_CXXFLAGS) -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS)
-DEPENDFLAGS = $(ENV_DEPENDFLAGS) -std=c++17
-LDFLAGS = $(ENV_LDFLAGS) $(EXTRALDFLAGS)
+### 2.6. Include Git commit hash and date
-ifeq ($(COMP),)
- COMP=gcc
+GIT_SHA := $(shell git rev-parse --short=8 HEAD 2> /dev/null)
+ifneq ($(GIT_SHA),)
+ SF_CXXFLAGS += -DGIT_SHA=$(GIT_SHA)
endif
-ifeq ($(COMP),gcc)
- comp=gcc
- CXX=g++
- CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations
-
- ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64))
- ifeq ($(OS),Android)
- CXXFLAGS += -m$(bits)
- LDFLAGS += -m$(bits)
- endif
- ifeq ($(ARCH),riscv64)
- CXXFLAGS += -latomic
- endif
- else ifeq ($(ARCH),loongarch64)
- CXXFLAGS += -latomic
- else
- CXXFLAGS += -m$(bits)
- LDFLAGS += -m$(bits)
- endif
-
- ifeq ($(arch),$(filter $(arch),armv7))
- LDFLAGS += -latomic
- endif
-
- ifneq ($(KERNEL),Darwin)
- LDFLAGS += -Wl,--no-as-needed
- endif
+GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2> /dev/null)
+ifneq ($(GIT_DATE),)
+ SF_CXXFLAGS += -DGIT_DATE=$(GIT_DATE)
endif
-ifeq ($(target_windows),yes)
- LDFLAGS += -static
-endif
+### 2.7. Add flags based on target OS
-ifeq ($(COMP),mingw)
- comp=mingw
-
- ifeq ($(bits),64)
- ifeq ($(shell which x86_64-w64-mingw32-c++-posix 2> /dev/null),)
- CXX=x86_64-w64-mingw32-c++
- else
- CXX=x86_64-w64-mingw32-c++-posix
- endif
- else
- ifeq ($(shell which i686-w64-mingw32-c++-posix 2> /dev/null),)
- CXX=i686-w64-mingw32-c++
- else
- CXX=i686-w64-mingw32-c++-posix
- endif
- endif
- CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations
+ifeq ($(MINGW),1)
+ SF_LDFLAGS += -static
endif
-ifeq ($(COMP),icx)
- comp=icx
- CXX=icpx
- CXXFLAGS += --intel -pedantic -Wextra -Wshadow -Wmissing-prototypes \
- -Wconditional-uninitialized -Wabi -Wdeprecated
-endif
-
-ifeq ($(COMP),clang)
- comp=clang
- CXX=clang++
- ifeq ($(target_windows),yes)
- CXX=x86_64-w64-mingw32-clang++
- endif
-
- CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-prototypes \
- -Wconditional-uninitialized
-
- ifeq ($(filter $(KERNEL),Darwin OpenBSD FreeBSD),)
- ifeq ($(target_windows),)
- ifneq ($(RTLIB),compiler-rt)
- LDFLAGS += -latomic
- endif
- endif
- endif
-
- ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64))
- ifeq ($(OS),Android)
- CXXFLAGS += -m$(bits)
- LDFLAGS += -m$(bits)
- endif
- ifeq ($(ARCH),riscv64)
- CXXFLAGS += -latomic
- endif
- else ifeq ($(ARCH),loongarch64)
- CXXFLAGS += -latomic
- else
- CXXFLAGS += -m$(bits)
- LDFLAGS += -m$(bits)
- endif
-endif
+endif # CXX_REQUIRED_RULES
-ifeq ($(KERNEL),Darwin)
- CXXFLAGS += -mmacosx-version-min=10.15
- LDFLAGS += -mmacosx-version-min=10.15
- ifneq ($(arch),any)
- CXXFLAGS += -arch $(arch)
- LDFLAGS += -arch $(arch)
- endif
- XCRUN = xcrun
-endif
+### 3. Add flags from architecture-specific Makefile
+### Note that this section is not enclosed in the CXX_REQUIRED_RULES block;
+### Users shall be able to see the help text even when there is no compiler.
-# To cross-compile for Android, NDK version r21 or later is recommended.
-# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils.
-# Currently we don't know how to make PGO builds with the NDK yet.
-ifeq ($(COMP),ndk)
- CXXFLAGS += -stdlib=libc++ -fPIE
- comp=clang
- ifeq ($(arch),armv7)
- CXX=armv7a-linux-androideabi16-clang++
- CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon
- ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),)
- STRIP=arm-linux-androideabi-strip
- else
- STRIP=llvm-strip
- endif
- endif
- ifeq ($(arch),armv8)
- CXX=aarch64-linux-android21-clang++
- ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),)
- STRIP=aarch64-linux-android-strip
- else
- STRIP=llvm-strip
- endif
- endif
- ifeq ($(arch),x86_64)
- CXX=x86_64-linux-android21-clang++
- ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),)
- STRIP=x86_64-linux-android-strip
- else
- STRIP=llvm-strip
- endif
- endif
- LDFLAGS += -static-libstdc++ -pie -lm -latomic
+ifeq ($(ARCH),)
+ override ARCH := native
endif
-ifeq ($(comp),icx)
- profile_make = icx-profile-make
- profile_use = icx-profile-use
-else ifeq ($(comp),clang)
- profile_make = clang-profile-make
- profile_use = clang-profile-use
+ifeq ($(ARCH),native)
+ ARCH_NATIVE := y
+ SF_CXXFLAGS += -march=native -DARCH_NATIVE
else
- profile_make = gcc-profile-make
- profile_use = gcc-profile-use
- ifeq ($(KERNEL),Darwin)
- EXTRAPROFILEFLAGS = -fvisibility=hidden
- endif
-endif
+ ifneq ($(filter x86%,$(ARCH)),)
+ ARCH_FAMILY := i386
+ else ifneq ($(filter arm%,$(ARCH)),)
+ ARCH_FAMILY := arm
+ else
+ ARCH_FAMILY := generic
+ endif
-### Allow overwriting CXX from command line
-ifdef COMPCXX
- CXX=$(COMPCXX)
+ include ./arch/$(ARCH_FAMILY)/Makefile
endif
-### Sometimes gcc is really clang
-ifeq ($(COMP),gcc)
- gccversion := $(shell $(CXX) --version 2>/dev/null)
- gccisclang := $(findstring clang,$(gccversion))
- ifneq ($(gccisclang),)
- profile_make = clang-profile-make
- profile_use = clang-profile-use
- endif
-endif
+export ARCH
-### On mingw use Windows threads, otherwise POSIX
-ifneq ($(comp),mingw)
- CXXFLAGS += -DUSE_PTHREADS
- # On Android Bionic's C library comes with its own pthread implementation bundled in
- ifneq ($(OS),Android)
- # Haiku has pthreads in its libroot, so only link it in on other platforms
- ifneq ($(KERNEL),Haiku)
- ifneq ($(COMP),ndk)
- LDFLAGS += -lpthread
- endif
- endif
- endif
-endif
+SF_CXXFLAGS += -DARCH=$(ARCH)
-### 3.2.1 Debugging
-ifeq ($(debug),no)
- CXXFLAGS += -DNDEBUG
-else
- CXXFLAGS += -g
-endif
-
-### 3.2.2 Debugging with undefined behavior sanitizers
-ifneq ($(sanitize),none)
- CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize))
- LDFLAGS += $(addprefix -fsanitize=,$(sanitize))
-endif
+### 4. Extra flags for cross-compilation
+### Information of target architecture is needed here.
-### 3.3 Optimization
-ifeq ($(optimize),yes)
+ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)
- CXXFLAGS += -O3 -funroll-loops
-
- ifeq ($(comp),gcc)
- ifeq ($(OS), Android)
- CXXFLAGS += -fno-gcse -mthumb -march=armv7-a -mfloat-abi=softfp
- endif
- endif
-
- ifeq ($(KERNEL),Darwin)
- ifeq ($(comp),$(filter $(comp),clang icx))
- CXXFLAGS += -mdynamic-no-pic
- endif
-
- ifeq ($(comp),gcc)
- ifneq ($(arch),arm64)
- CXXFLAGS += -mdynamic-no-pic
- endif
- endif
- endif
-
- ifeq ($(comp),clang)
- clangmajorversion := $(shell $(CXX) -dumpversion 2>/dev/null | cut -f1 -d.)
- ifeq ($(shell expr $(clangmajorversion) \< 16),1)
- CXXFLAGS += -fexperimental-new-pass-manager
- endif
- endif
+# Android NDK
+ifneq ($(filter $(ARCH_FAMILY),i386 arm),)
+ ifeq ($(call test-compiler-macro,__ANDROID__),1)
+ SF_CXXFLAGS += -stdlib=libc++ -fPIE
+ SF_LDFLAGS += -static-libstdc++ -pie
+ SF_LIBS += m atomic
+ endif
endif
-### 3.4 Bits
-ifeq ($(bits),64)
- CXXFLAGS += -DIS_64BIT
+# Link atomic library if not i386/arm family
+ifneq ($(ARCH_NATIVE),y)
+ ifeq ($(filter $(ARCH_FAMILY),i386 arm),)
+ SF_LIBS += atomic
+ endif
endif
-### 3.5 prefetch and popcount
-ifeq ($(prefetch),yes)
- ifeq ($(sse),yes)
- CXXFLAGS += -msse
- endif
-else
- CXXFLAGS += -DNO_PREFETCH
-endif
+endif # CXX_REQUIRED_RULES
+endif # MAKELEVEL=0
-ifeq ($(popcnt),yes)
- ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64))
- CXXFLAGS += -DUSE_POPCNT
- else
- CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT
- endif
-endif
+SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS))
+SF_LDFLAGS := $(strip $(SF_LDFLAGS) $(LDFLAGS))
+SF_LIBS := $(strip $(SF_LIBS) $(LIBS))
-### 3.6 SIMD architectures
-ifeq ($(avx2),yes)
- CXXFLAGS += -DUSE_AVX2
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mavx2 -mbmi
- endif
-endif
+export SF_CXXFLAGS SF_LDFLAGS SF_LIBS
-ifeq ($(avxvnni),yes)
- CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mavxvnni
- endif
-endif
+### ==========================================================================
-ifeq ($(avx512),yes)
- CXXFLAGS += -DUSE_AVX512
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mavx512f -mavx512bw
- endif
-endif
+define HELP_STRING
+To see architecture-specific build options, run 'make help ARCH='.
+Currently supported values: x86, arm, generic
-ifeq ($(vnni256),yes)
- CXXFLAGS += -DUSE_VNNI
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256
- endif
-endif
+How-to-build examples:
-ifeq ($(vnni512),yes)
- CXXFLAGS += -DUSE_VNNI
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=512
- endif
-endif
+ make profile-build
-ifeq ($(sse41),yes)
- CXXFLAGS += -DUSE_SSE41
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -msse4.1
- endif
-endif
+Build Stockfish with profile-guided optimization (PGO) for the current
+architecture.
-ifeq ($(ssse3),yes)
- CXXFLAGS += -DUSE_SSSE3
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mssse3
- endif
-endif
+ make build ARCH=x86-64-avx2 CXX=clang++-18
-ifeq ($(sse2),yes)
- CXXFLAGS += -DUSE_SSE2
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -msse2
- endif
-endif
+Build Stockfish for the x86-64 architecture with AVX2/BMI2 support using clang++-18.
-ifeq ($(mmx),yes)
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mmmx
- endif
-endif
+Check the Stockfish wiki for advanced build configuration.
-ifeq ($(neon),yes)
- CXXFLAGS += -DUSE_NEON=$(arm_version)
- ifeq ($(KERNEL),Linux)
- ifneq ($(COMP),ndk)
- ifneq ($(arch),armv8)
- CXXFLAGS += -mfpu=neon
- endif
- endif
- endif
-endif
+endef
+export HELP_STRING
-ifeq ($(dotprod),yes)
- CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD
-endif
+# Print how-to-build help text if architecture is not set, otherwise
+# list all available build presets for the selected architecture.
+ifneq ($(ARCH_NATIVE),y)
-### 3.7 pext
-ifeq ($(pext),yes)
- CXXFLAGS += -DUSE_PEXT
- ifeq ($(comp),$(filter $(comp),gcc clang mingw icx))
- CXXFLAGS += -mbmi2
- endif
-endif
+help: help-arch
-### 3.8.1 Try to include git commit sha for versioning
-GIT_SHA := $(shell git rev-parse HEAD 2>/dev/null | cut -c 1-8)
-ifneq ($(GIT_SHA), )
- CXXFLAGS += -DGIT_SHA=$(GIT_SHA)
-endif
+config-sanity: config-sanity-arch
-### 3.8.2 Try to include git commit date for versioning
-GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2>/dev/null)
-ifneq ($(GIT_DATE), )
- CXXFLAGS += -DGIT_DATE=$(GIT_DATE)
-endif
+else
-### 3.8.3 Try to include architecture
-ifneq ($(ARCH), )
- CXXFLAGS += -DARCH=$(ARCH)
-endif
+help:
+ @echo "$${HELP_STRING}"
-### 3.9 Link Time Optimization
-### This is a mix of compile and link time options because the lto link phase
-### needs access to the optimization flags.
-ifeq ($(optimize),yes)
-ifeq ($(debug), no)
- ifeq ($(comp),$(filter $(comp),clang icx))
- CXXFLAGS += -flto=full
- ifeq ($(comp),icx)
- CXXFLAGS += -fwhole-program-vtables
- endif
- ifeq ($(target_windows),yes)
- CXXFLAGS += -fuse-ld=lld
- endif
- LDFLAGS += $(CXXFLAGS)
-
-# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be
-# GCC on some systems.
- else ifeq ($(comp),gcc)
- ifeq ($(gccisclang),)
- CXXFLAGS += -flto -flto-partition=one
- LDFLAGS += $(CXXFLAGS) -flto=jobserver
- else
- CXXFLAGS += -flto=full
- LDFLAGS += $(CXXFLAGS)
- endif
-
-# To use LTO and static linking on Windows,
-# the tool chain requires gcc version 10.1 or later.
- else ifeq ($(comp),mingw)
- CXXFLAGS += -flto -flto-partition=one
- LDFLAGS += $(CXXFLAGS) -save-temps
- endif
-endif
endif
-### 3.10 Android 5 can only run position independent executables. Note that this
-### breaks Android 4.0 and earlier.
-ifeq ($(OS), Android)
- CXXFLAGS += -fPIE
- LDFLAGS += -fPIE -pie
-endif
+define CONFIG_SANITY_STRING
-### ==========================================================================
-### Section 4. Public Targets
-### ==========================================================================
+Build options:
+ optimize: $(optimize)
+ debug: $(debug)
+ sanitize: $(sanitize)
-help:
- @echo ""
- @echo "To compile stockfish, type: "
- @echo ""
- @echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]"
- @echo ""
- @echo "Supported targets:"
- @echo ""
- @echo "help > Display architecture details"
- @echo "profile-build > standard build with profile-guided optimization"
- @echo "build > skip profile-guided optimization"
- @echo "net > Download the default nnue nets"
- @echo "strip > Strip executable"
- @echo "install > Install executable"
- @echo "clean > Clean up"
- @echo ""
- @echo "Supported archs:"
- @echo ""
- @echo "native > select the best architecture for the host processor (default)"
- @echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support"
- @echo "x86-64-vnni256 > x86 64-bit with vnni 512bit support, limit operands to 256bit wide"
- @echo "x86-64-avx512 > x86 64-bit with avx512 support"
- @echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support"
- @echo "x86-64-bmi2 > x86 64-bit with bmi2 support"
- @echo "x86-64-avx2 > x86 64-bit with avx2 support"
- @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support"
- @echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt"
- @echo "x86-64-ssse3 > x86 64-bit with ssse3 support"
- @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support"
- @echo "x86-64 > x86 64-bit generic (with sse2 support)"
- @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support"
- @echo "x86-32-sse2 > x86 32-bit with sse2 support"
- @echo "x86-32 > x86 32-bit generic (with mmx compile support)"
- @echo "ppc-64 > PPC 64-bit"
- @echo "ppc-32 > PPC 32-bit"
- @echo "armv7 > ARMv7 32-bit"
- @echo "armv7-neon > ARMv7 32-bit with popcnt and neon"
- @echo "armv8 > ARMv8 64-bit with popcnt and neon"
- @echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support"
- @echo "e2k > Elbrus 2000"
- @echo "apple-silicon > Apple silicon ARM64"
- @echo "general-64 > unspecified 64-bit"
- @echo "general-32 > unspecified 32-bit"
- @echo "riscv64 > RISC-V 64-bit"
- @echo "loongarch64 > LoongArch 64-bit"
- @echo ""
- @echo "Supported compilers:"
- @echo ""
- @echo "gcc > GNU compiler (default)"
- @echo "mingw > GNU compiler with MinGW under Windows"
- @echo "clang > LLVM Clang compiler"
- @echo "icx > Intel oneAPI DPC++/C++ Compiler"
- @echo "ndk > Google NDK to cross-compile for Android"
- @echo ""
- @echo "Simple examples. If you don't know what to do, you likely want to run one of: "
- @echo ""
- @echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems "
- @echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems "
- @echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems "
- @echo ""
- @echo "Advanced examples, for experienced users: "
- @echo ""
- @echo "make -j profile-build ARCH=x86-64-avxvnni"
- @echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0"
- @echo "make -j build ARCH=x86-64-ssse3 COMP=clang"
- @echo ""
-ifneq ($(SUPPORTED_ARCH), true)
- @echo "Specify a supported architecture with the ARCH option for more details"
- @echo ""
-endif
+Compiler options:
+ CXX: $(CXX)
+ CXXFLAGS: $(SF_CXXFLAGS)
+ LDFLAGS: $(SF_LDFLAGS) $(SF_LIBS:%=-l%)
+endef
+export CONFIG_SANITY_STRING
-.PHONY: help analyze build profile-build strip install clean net \
- objclean profileclean config-sanity \
- icx-profile-use icx-profile-make \
- gcc-profile-use gcc-profile-make \
- clang-profile-use clang-profile-make FORCE \
- format analyze
+config-sanity: net
+ @[ "$(optimize)" = "yes" -o "$(optimize)" = "no" ]
+ @[ "$(debug)" = "yes" -o "$(debug)" = "no" ]
+ @[ ! -z "$(sanitize)" ]
+ @echo "$${CONFIG_SANITY_STRING}"
-analyze: net config-sanity objclean
- $(MAKE) -k ARCH=$(ARCH) COMP=$(COMP) $(OBJS)
+analyze: config-sanity objclean
+ @$(MAKE) -k --no-print-directory CXXFLAGS="" LDFLAGS="" $(OBJS)
-build: net config-sanity
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
+build: config-sanity
+ @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" all
-profile-build: net config-sanity objclean profileclean
- @echo ""
+profile-build: config-sanity objclean profileclean
@echo "Step 1/4. Building instrumented executable ..."
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
- @echo ""
- @echo "Step 2/4. Running benchmark for pgo-build ..."
- $(PGOBENCH) > PGOBENCH.out 2>&1
- tail -n 4 PGOBENCH.out
- @echo ""
- @echo "Step 3/4. Building optimized executable ..."
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use)
- @echo ""
- @echo "Step 4/4. Deleting profile data ..."
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean
-
-strip:
- $(STRIP) $(EXE)
-
-install:
- -mkdir -p -m 755 $(BINDIR)
- -cp $(EXE) $(BINDIR)
- $(STRIP) $(BINDIR)/$(EXE)
-
-# clean all
-clean: objclean profileclean
- @rm -f .depend *~ core
-
-# clean binaries and objects
-objclean:
- @rm -f stockfish stockfish.exe *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o
-
-# clean auxiliary profiling files
-profileclean:
- @rm -rf profdir
- @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out
- @rm -f stockfish.profdata *.profraw
- @rm -f stockfish.*args*
- @rm -f stockfish.*lt*
- @rm -f stockfish.res
- @rm -f ./-lstdc++.res
-
-# evaluation network (nnue)
-net:
- @$(SHELL) ../scripts/net.sh
-
-format:
- $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file
-
-# default target
-default:
- help
-
-### ==========================================================================
-### Section 5. Private Targets
-### ==========================================================================
+ @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make)
+ @printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..."
+ @$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1
+ @tail -n 4 PGOBENCH.out
+ @printf "\n%s\n" "Step 3/4. Building optimized executable ..."
+ @$(MAKE) --no-print-directory objclean
+ @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_use)
+ @printf "\n%s\n" "Step 4/4. Deleting profile data ..."
+ @$(MAKE) --no-print-directory profileclean
all: $(EXE) .depend
-config-sanity: net
- @echo ""
- @echo "Config:"
- @echo "debug: '$(debug)'"
- @echo "sanitize: '$(sanitize)'"
- @echo "optimize: '$(optimize)'"
- @echo "arch: '$(arch)'"
- @echo "bits: '$(bits)'"
- @echo "kernel: '$(KERNEL)'"
- @echo "os: '$(OS)'"
- @echo "prefetch: '$(prefetch)'"
- @echo "popcnt: '$(popcnt)'"
- @echo "pext: '$(pext)'"
- @echo "sse: '$(sse)'"
- @echo "mmx: '$(mmx)'"
- @echo "sse2: '$(sse2)'"
- @echo "ssse3: '$(ssse3)'"
- @echo "sse41: '$(sse41)'"
- @echo "avx2: '$(avx2)'"
- @echo "avxvnni: '$(avxvnni)'"
- @echo "avx512: '$(avx512)'"
- @echo "vnni256: '$(vnni256)'"
- @echo "vnni512: '$(vnni512)'"
- @echo "neon: '$(neon)'"
- @echo "dotprod: '$(dotprod)'"
- @echo "arm_version: '$(arm_version)'"
- @echo "target_windows: '$(target_windows)'"
- @echo ""
- @echo "Flags:"
- @echo "CXX: $(CXX)"
- @echo "CXXFLAGS: $(CXXFLAGS)"
- @echo "LDFLAGS: $(LDFLAGS)"
- @echo ""
- @echo "Testing config sanity. If this fails, try 'make help' ..."
- @echo ""
- @test "$(debug)" = "yes" || test "$(debug)" = "no"
- @test "$(optimize)" = "yes" || test "$(optimize)" = "no"
- @test "$(SUPPORTED_ARCH)" = "true"
- @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
- test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \
- test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64"
- @test "$(bits)" = "32" || test "$(bits)" = "64"
- @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no"
- @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no"
- @test "$(pext)" = "yes" || test "$(pext)" = "no"
- @test "$(sse)" = "yes" || test "$(sse)" = "no"
- @test "$(mmx)" = "yes" || test "$(mmx)" = "no"
- @test "$(sse2)" = "yes" || test "$(sse2)" = "no"
- @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no"
- @test "$(sse41)" = "yes" || test "$(sse41)" = "no"
- @test "$(avx2)" = "yes" || test "$(avx2)" = "no"
- @test "$(avx512)" = "yes" || test "$(avx512)" = "no"
- @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no"
- @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no"
- @test "$(neon)" = "yes" || test "$(neon)" = "no"
- @test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \
- || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang"
-
$(EXE): $(OBJS)
- +$(CXX) -o $@ $(OBJS) $(LDFLAGS)
+ +$(CXX) $(SF_LDFLAGS) -o $@ $(OBJS) $(SF_LIBS:%=-l%)
+
+%.o: %.cpp
+ +$(CXX) $(SF_CXXFLAGS) -c -o $@ $<
# Force recompilation to ensure version info is up-to-date
misc.o: FORCE
FORCE:
-clang-profile-make:
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-generate ' \
- EXTRALDFLAGS=' -fprofile-generate' \
- all
-
-clang-profile-use:
- $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' \
- EXTRALDFLAGS='-fprofile-use ' \
- all
+.depend: $(SRCS)
+ -@$(CXX) $(SF_CXXFLAGS) -MM $(SRCS) > $@ && \
+ printf "%s\n\n" "Dependency updated, restarting Make..."
gcc-profile-make:
@mkdir -p profdir
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-generate=profdir' \
- EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \
- EXTRALDFLAGS='-lgcov' \
- all
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-generate=profdir" LDFLAGS="" LIBS="gcov" all
gcc-profile-use:
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \
- EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \
- EXTRALDFLAGS='-lgcov' \
- all
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-use=profdir -fno-peel-loops -fno-tracer" LDFLAGS="" LIBS="gcov" all
+
+clang-profile-make:
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all
+
+clang-profile-use:
+ $(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-use=stockfish.profdata" \
+ LDFLAGS="-fprofile-use=stockfish.profdata" \
+ all
icx-profile-make:
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-instr-generate ' \
- EXTRALDFLAGS=' -fprofile-instr-generate' \
- all
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-instr-generate" LDFLAGS="-fprofile-instr-generate" all
icx-profile-use:
- $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
- $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
- EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \
- EXTRALDFLAGS='-fprofile-use ' \
- all
+ @$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw
+ @$(MAKE) --no-print-directory \
+ CXXFLAGS="-fprofile-instr-use=stockfish.profdata" LDFLAGS="-fprofile-use" all
-.depend: $(SRCS)
- -@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null
-
-ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean config-sanity))
+ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),)
-include .depend
-endif
+endif
diff --git a/src/arch/.clang-format b/src/arch/.clang-format
new file mode 100644
index 00000000000..50b996b1a8a
--- /dev/null
+++ b/src/arch/.clang-format
@@ -0,0 +1,5 @@
+BasedOnStyle: InheritParentConfig
+
+# Architecture specific files use a lot of preprocessor directives.
+# Do not indent them for better readability.
+IndentPPDirectives: None
diff --git a/src/arch/arm/Makefile b/src/arch/arm/Makefile
new file mode 100644
index 00000000000..6f33d3e1576
--- /dev/null
+++ b/src/arch/arm/Makefile
@@ -0,0 +1,43 @@
+define HELP_STRING_ARM
+To build Stockfish, run the following command:
+
+make [ARCH=] [CPU=]
+
+Build presets for ARM/AArch64 architecture:
+
+armv8-dotprod > ARMv8.2-A 64-bit, Neon with DotProd feature
+armv8 > ARMv8-A 64-bit, Neon (Advanced SIMD)
+armv7-neon > ARMv7-A 32-bit, Neon (Advanced SIMD)
+armv7 > ARMv7-A 32-bit, no SIMD support
+
+Stockfish does not support AArch32 targets.
+Stockfish does not support non-A profile architectures.
+
+endef
+export HELP_STRING_ARM
+
+ifneq ($(filter $(COMP),gcc clang icx),)
+
+ifeq ($(ARCH),armv8-dotprod)
+ SF_CXXFLAGS += -march=armv8.2-a+dotprod
+else ifeq ($(ARCH),armv8)
+ SF_CXXFLAGS += -march=armv8-a
+else ifeq ($(ARCH),armv7-neon)
+ SF_CXXFLAGS += -march=armv7-a -mfpu=neon -mfloat-abi=softfp
+else ifeq ($(ARCH),armv7)
+ SF_CXXFLAGS += -march=armv7-a
+endif
+
+endif # gcc clang icx
+
+.PHONY: help-arch config-sanity-arch
+
+help-arch:
+ @echo "$${HELP_STRING_ARM}"
+
+config-sanity-arch:
+ @[ "$(ARCH)" = "armv8-dotprod" -o \
+ "$(ARCH)" = "armv8" -o \
+ "$(ARCH)" = "armv7-neon" -o \
+ "$(ARCH)" = "armv7" \
+ ]
diff --git a/src/arch/arm/arch.h b/src/arch/arm/arch.h
new file mode 100644
index 00000000000..30f50451aaf
--- /dev/null
+++ b/src/arch/arm/arch.h
@@ -0,0 +1,119 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_ARCH_H_INCLUDED
+#define ARM_ARCH_H_INCLUDED
+
+#if !defined(__arm__) && !defined(__aarch64__) && __ARM_ARCH_PROFILE != 'A'
+#error "Not supported in the current architecture."
+#endif
+
+#if __ARM_ARCH >= 8 && (!defined(__ARM_64BIT_STATE) || !defined(__ARM_NEON))
+#error "Invalid AArch64 state."
+#endif
+
+#include
+#include
+#include
+
+#include "common.h"
+
+#include
+
+#ifdef __ARM_NEON
+
+#include
+
+namespace Stockfish {
+
+template
+inline int __neon_cnt(T n) {
+ static_assert(std::is_integral_v && sizeof(T) <= 8);
+
+ uint8x8_t cnt = vcnt_u8(vcreate_u8(std::uint64_t(n)));
+
+#if __ARM_ARCH >= 8
+ return vaddv_u8(cnt);
+#else
+ return vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(cnt))), 0);
+#endif
+}
+
+inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) {
+#ifdef __ARM_FEATURE_DOTPROD
+ acc = vdotq_s32(acc, in, col);
+#elif __ARM_ARCH >= 8
+ int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+ int16x8_t product1 = vmull_high_s8(in, col);
+ int16x8_t sum = vpaddq_s16(product0, product1);
+ acc = vpadalq_s16(acc, sum);
+#else
+ int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col));
+ int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col));
+ int16x8_t sum =
+ vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1)));
+ acc = vpadalq_s16(acc, sum);
+#endif
+}
+
+} // namespace Stockfish
+
+#endif // __ARM_NEON
+
+namespace Stockfish {
+
+inline constexpr bool ArchImpl::Is64Bit = __ARM_ARCH >= 8;
+inline constexpr bool ArchImpl::UsePEXT = false;
+
+template
+inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {}
+
+template
+inline unsigned int ArchImpl::popcount(T n) {
+ static_assert(std::is_integral_v && sizeof(T) <= 8);
+
+#ifdef __ARM_NEON
+ return __neon_cnt(n);
+#else
+ return __popcount_value(n);
+#endif
+}
+
+template
+inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) {
+ return 0;
+}
+
+// ===========================================================================
+// The functions below are used on ARM/AArch64 targets only.
+// ===========================================================================
+
+template
+inline int ctz(T n) {
+ static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8));
+ assert(n != 0);
+
+ if constexpr (sizeof(T) == 8)
+ return __clzll(__rbitll(std::uint64_t(n)));
+ else
+ return __clz(__rbit(std::uint32_t(n)));
+}
+
+} // namespace Stockfish
+
+#endif // ARM_ARCH_H_INCLUDED
diff --git a/src/arch/arm/nnue/layers/affine_transform.h b/src/arch/arm/nnue/layers/affine_transform.h
new file mode 100644
index 00000000000..3fb69bce2d8
--- /dev/null
+++ b/src/arch/arm/nnue/layers/affine_transform.h
@@ -0,0 +1,106 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check ARM/AArch64 SIMD features.
+// If none is defined, fall back to the generic implementation.
+#ifndef __ARM_NEON
+
+#include "arch/generic/nnue/layers/affine_transform.h"
+
+#else
+
+#include "../../arch.h"
+
+#include
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+constexpr IndexType AffineTransform::get_weight_index(IndexType i) {
+ return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
+ + i / PaddedInputDimensions * 4 + i % 4;
+}
+
+template
+void AffineTransform::propagate(const InputType* input, OutputType* output) const {
+ if constexpr (OutputDimensions > 1)
+ {
+ static constexpr IndexType OutputLanes = 16 / sizeof(OutputType);
+ static_assert(OutputDimensions % OutputLanes == 0);
+
+ static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4;
+ static constexpr IndexType NumRegs = OutputDimensions / OutputLanes;
+
+ int32x4_t acc[NumRegs];
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = reinterpret_cast(biases)[k];
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+ const int8x16_t in =
+ vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i]));
+ const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ vdotq_s32_v(acc[k], in, col[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ reinterpret_cast(output)[k] = acc[k];
+ }
+ else if constexpr (OutputDimensions == 1)
+ {
+ static constexpr IndexType InputLanes = 16 / sizeof(InputType);
+ static_assert(PaddedInputDimensions % InputLanes == 0);
+
+ static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes;
+
+ int32x4_t sum = vdupq_n_s32(0);
+
+ for (IndexType j = 0; j < NumChunks; ++j)
+ {
+ const int8x16_t in = reinterpret_cast(input)[j];
+ const int8x16_t row = reinterpret_cast(weights)[j];
+ vdotq_s32_v(sum, in, row);
+ }
+
+#if __ARM_ARCH >= 8
+ output[0] = vaddvq_s32(sum) + biases[0];
+#else
+ output[0] = vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2)
+ + vgetq_lane_s32(sum, 3) + biases[0];
+#endif
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // !__ARM_NEON
+
+#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
diff --git a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
new file mode 100644
index 00000000000..546002c256f
--- /dev/null
+++ b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h
@@ -0,0 +1,129 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include "../../arch.h"
+
+#include
+#include
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+#if __ARM_ARCH >= 8
+
+alignas(CacheLineSize) static const std::array, 256> lookupIndices =
+ [] {
+ std::array, 256> array{};
+ for (std::uint64_t i = 0; i < 256; ++i)
+ {
+ std::uint64_t j = i, k = 0;
+ while (j)
+ array[i][k++] = ctz(j), j &= j - 1;
+ }
+ return array;
+ }();
+
+template
+class AffineTransformSparseInput: public AffineTransform {
+ __DEFINE_BASE_PROPERTIES
+
+ static_assert(OutputDimensions % 16 == 0,
+ "OutputDimensions must be multiple of 16 for this layer.");
+
+ public:
+ void propagate(const InputType* input, OutputType* output) const;
+
+ private:
+ template
+ static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) {
+ IndexType count = 0;
+ uint16x8_t base = vdupq_n_u16(0);
+
+ const auto in = reinterpret_cast(input);
+
+ for (IndexType i = 0; i < InputDimensions / 8; ++i)
+ {
+ const int32x4_t chunk0 = in[i * 2];
+ const int32x4_t chunk1 = in[i * 2 + 1];
+
+ static const uint32x4_t movemask = [] {
+ const std::uint32_t n[4] = {1, 2, 4, 8};
+ return vld1q_u32(n);
+ }();
+
+ const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask))
+ | vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask))
+ << 4;
+ const uint16x8_t offsets = *reinterpret_cast(&lookupIndices[nnz]);
+ *reinterpret_cast(indices + count) = vaddq_u16(base, offsets);
+ count += popcount(nnz);
+ base = vaddq_u16(base, vdupq_n_u16(8));
+ }
+
+ return count;
+ }
+};
+
+template
+void AffineTransformSparseInput::propagate(const InputType* input,
+ OutputType* output) const {
+ static constexpr IndexType OutputLanes = 16 / sizeof(OutputType);
+
+ static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4;
+ static constexpr IndexType NumRegs = OutputDimensions / OutputLanes;
+
+ int32x4_t acc[NumRegs];
+ std::uint16_t nnz[NumChunks];
+ IndexType count = populate_nz_indices(input, nnz);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = reinterpret_cast(biases)[k];
+
+ for (IndexType j = 0; j < count; ++j)
+ {
+ const auto i = nnz[j];
+ const int8x16_t in =
+ vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i]));
+ const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]);
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ vdotq_s32_v(acc[k], in, col[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ reinterpret_cast(output)[k] = acc[k];
+}
+
+#else
+
+template
+using AffineTransformSparseInput = AffineTransform;
+
+#endif // __ARM_ARCH >= 8
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
diff --git a/src/arch/arm/nnue/layers/clipped_relu.h b/src/arch/arm/nnue/layers/clipped_relu.h
new file mode 100644
index 00000000000..7e644ec5060
--- /dev/null
+++ b/src/arch/arm/nnue/layers/clipped_relu.h
@@ -0,0 +1,72 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#define ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check ARM/AArch64 SIMD features.
+// If none is defined, fall back to the generic implementation.
+#ifndef __ARM_NEON
+
+#include "arch/generic/nnue/layers/clipped_relu.h"
+
+#else
+
+#include "../../arch.h"
+
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+void ClippedReLU::propagate(const InputType* input, OutputType* output) const {
+ static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 8;
+
+ const auto in = reinterpret_cast(input);
+ const auto out = reinterpret_cast(output);
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+#if __ARM_ARCH >= 8
+ int16x4_t words0 = vqshrn_n_s32(in[i * 2], WeightScaleBits);
+ int16x8_t words = vqshrn_high_n_s32(words0, in[i * 2 + 1], WeightScaleBits);
+ out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0));
+#else
+ union {
+ int16x4x2_t tuple;
+ int16x8_t all;
+ } words;
+
+ words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
+ words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
+ out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0));
+#endif
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // !__ARM_NEON
+
+#endif // ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/arm/nnue/layers/sqr_clipped_relu.h b/src/arch/arm/nnue/layers/sqr_clipped_relu.h
new file mode 100644
index 00000000000..f67388c6fac
--- /dev/null
+++ b/src/arch/arm/nnue/layers/sqr_clipped_relu.h
@@ -0,0 +1,29 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#define ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// lazy
+#include "arch/generic/nnue/layers/sqr_clipped_relu.h"
+
+#endif // ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/arm/nnue/nnue_feature_transformer.h b/src/arch/arm/nnue/nnue_feature_transformer.h
new file mode 100644
index 00000000000..fe0b077f008
--- /dev/null
+++ b/src/arch/arm/nnue/nnue_feature_transformer.h
@@ -0,0 +1,343 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#define ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+
+#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check ARM/AArch64 SIMD features.
+// If none is defined, fall back to the generic implementation.
+#ifndef __ARM_NEON
+
+#include "arch/generic/nnue/nnue_feature_transformer.h"
+
+#else
+
+#include "../arch.h"
+
+#include
+#include
+
+#include "misc.h"
+#include "position.h"
+#include "types.h"
+#include "nnue/nnue_accumulator.h"
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE {
+
+template StateInfo::*accPtr>
+struct FeatureTransformer::Details {
+ private:
+ static constexpr int NumQReg = 16;
+
+ public:
+ static constexpr int OptimalAccRegisterCount =
+ optimal_register_count<16, NumQReg, sizeof(WeightType), TransformedFeatureDimensions>();
+ static constexpr int OptimalPSQTRegisterCount =
+ optimal_register_count<16, NumQReg, sizeof(PSQTWeightType), PSQTBuckets>();
+
+ static constexpr IndexType TileHeight = OptimalAccRegisterCount * 16 / sizeof(WeightType);
+ static constexpr IndexType PsqtTileHeight =
+ OptimalPSQTRegisterCount * 16 / sizeof(PSQTWeightType);
+
+ static_assert(HalfDimensions % TileHeight == 0,
+ "HalfDimensions must be multiple of TileHeight");
+ static_assert(PSQTBuckets % PsqtTileHeight == 0,
+ "PSQTBuckets must be multiple of PsqtTileHeight");
+};
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::permute_weights() {}
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::
+ apply_accumulator_updates_incremental(StateInfo* computed,
+ StateInfo* next,
+ FeatureSet::IndexList& removed,
+ FeatureSet::IndexList& added) const {
+ // The most common case when updating the accumulator incrementally.
+ // Calculates feature differences directly without using tiling mechanism.
+ if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1)
+ {
+ const auto accIn =
+ reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]);
+ const auto accOut =
+ reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]);
+
+ const IndexType offsetR0 = HalfDimensions * removed[0];
+ const auto columnR0 = reinterpret_cast(&weights[offsetR0]);
+ const IndexType offsetA = HalfDimensions * added[0];
+ const auto columnA = reinterpret_cast(&weights[offsetA]);
+
+ if (removed.size() == 1)
+ {
+ for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i)
+ accOut[i] = vaddq_s16(vsubq_s16(accIn[i], columnR0[i]), columnA[i]);
+ }
+ else
+ {
+ const IndexType offsetR1 = HalfDimensions * removed[1];
+ const auto columnR1 = reinterpret_cast(&weights[offsetR1]);
+
+ for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i)
+ accOut[i] =
+ vsubq_s16(vaddq_s16(accIn[i], columnA[i]), vaddq_s16(columnR0[i], columnR1[i]));
+ }
+
+ const auto accPsqtIn =
+ reinterpret_cast(&(computed->*accPtr).psqtAccumulation[Perspective][0]);
+ const auto accPsqtOut =
+ reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]);
+
+ const IndexType offsetPsqtR0 = PSQTBuckets * removed[0];
+ auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]);
+ const IndexType offsetPsqtA = PSQTBuckets * added[0];
+ auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]);
+
+ if (removed.size() == 1)
+ {
+ for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i)
+ accPsqtOut[i] = vaddq_s32(vsubq_s32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA[i]);
+ }
+ else
+ {
+ const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
+ const auto columnPsqtR1 =
+ reinterpret_cast(&psqtWeights[offsetPsqtR1]);
+
+ for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i)
+ accPsqtOut[i] = vsubq_s32(vaddq_s32(accPsqtIn[i], columnPsqtA[i]),
+ vaddq_s32(columnPsqtR0[i], columnPsqtR1[i]));
+ }
+ }
+ else
+ {
+ int16x8_t acc[Details::OptimalAccRegisterCount];
+
+ for (IndexType i = 0; i < HalfDimensions / Details::TileHeight; ++i)
+ {
+ const IndexType offsetRow = i * Details::TileHeight;
+
+ const auto accTileIn = reinterpret_cast(
+ &(computed->*accPtr).accumulation[Perspective][offsetRow]);
+ for (std::size_t j = 0; j < array_size(acc); ++j)
+ acc[j] = accTileIn[j];
+
+ for (const auto index : removed)
+ {
+ const IndexType offset = HalfDimensions * index + offsetRow;
+ const auto column = reinterpret_cast(&weights[offset]);
+ for (std::size_t j = 0; j < array_size(acc); ++j)
+ acc[j] = vsubq_s16(acc[j], column[j]);
+ }
+
+ for (const auto index : added)
+ {
+ const IndexType offset = HalfDimensions * index + offsetRow;
+ const auto column = reinterpret_cast(&weights[offset]);
+ for (std::size_t j = 0; j < array_size(acc); ++j)
+ acc[j] = vaddq_s16(acc[j], column[j]);
+ }
+
+ const auto accTileOut =
+ reinterpret_cast(&(next->*accPtr).accumulation[Perspective][offsetRow]);
+ for (std::size_t j = 0; j < array_size(acc); ++j)
+ accTileOut[j] = acc[j];
+ }
+
+ int32x4_t psqt[Details::OptimalPSQTRegisterCount];
+
+ for (IndexType i = 0; i < PSQTBuckets / Details::PsqtTileHeight; ++i)
+ {
+ const IndexType offsetRow = i * Details::PsqtTileHeight;
+
+ auto accTilePsqtIn = reinterpret_cast(
+ &(computed->*accPtr).psqtAccumulation[Perspective][offsetRow]);
+ for (std::size_t j = 0; j < array_size(psqt); ++j)
+ psqt[j] = accTilePsqtIn[j];
+
+ for (const auto index : removed)
+ {
+ const IndexType offset = PSQTBuckets * index + offsetRow;
+ auto columnPsqt = reinterpret_cast(&psqtWeights[offset]);
+ for (std::size_t j = 0; j < array_size(psqt); ++j)
+ psqt[j] = vsubq_s32(psqt[j], columnPsqt[j]);
+ }
+
+ for (const auto index : added)
+ {
+ const IndexType offset = PSQTBuckets * index + offsetRow;
+ auto columnPsqt = reinterpret_cast(&psqtWeights[offset]);
+ for (std::size_t j = 0; j < array_size(psqt); ++j)
+ psqt[j] = vaddq_s32(psqt[j], columnPsqt[j]);
+ }
+
+ auto accTilePsqtOut = reinterpret_cast(
+ &(next->*accPtr).psqtAccumulation[Perspective][offsetRow]);
+ for (std::size_t j = 0; j < array_size(psqt); ++j)
+ accTilePsqtOut[j] = psqt[j];
+ }
+ }
+}
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::
+ apply_accumulator_updates_refresh_cache(
+ Accumulator& accumulator,
+ typename AccumulatorCaches::Cache::Entry& entry,
+ FeatureSet::IndexList removed,
+ FeatureSet::IndexList added) const {
+ int16x8_t acc[Details::OptimalAccRegisterCount];
+
+ for (IndexType j = 0; j < HalfDimensions / Details::TileHeight; ++j)
+ {
+ const IndexType offsetRow = j * Details::TileHeight;
+
+ const auto accTile =
+ reinterpret_cast(&accumulator.accumulation[Perspective][offsetRow]);
+ const auto entryTile = reinterpret_cast(&entry.accumulation[offsetRow]);
+
+ for (IndexType k = 0; k < array_size(acc); ++k)
+ acc[k] = entryTile[k];
+
+ std::size_t i = 0;
+ for (; i < std::min(removed.size(), added.size()); ++i)
+ {
+ const IndexType offsetR = HalfDimensions * removed[i] + offsetRow;
+ const auto columnR = reinterpret_cast(&weights[offsetR]);
+ const IndexType offsetA = HalfDimensions * added[i] + offsetRow;
+ const auto columnA = reinterpret_cast(&weights[offsetA]);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = vaddq_s16(acc[k], vsubq_s16(columnA[k], columnR[k]));
+ }
+ for (; i < removed.size(); ++i)
+ {
+ const IndexType offset = HalfDimensions * removed[i] + offsetRow;
+ const auto column = reinterpret_cast(&weights[offset]);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = vsubq_s16(acc[k], column[k]);
+ }
+ for (; i < added.size(); ++i)
+ {
+ const IndexType offset = HalfDimensions * added[i] + offsetRow;
+ const auto column = reinterpret_cast(&weights[offset]);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = vaddq_s16(acc[k], column[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(acc); k++)
+ entryTile[k] = acc[k];
+ for (std::size_t k = 0; k < array_size(acc); k++)
+ accTile[k] = acc[k];
+ }
+
+ int32x4_t psqt[Details::OptimalPSQTRegisterCount];
+
+ for (IndexType j = 0; j < PSQTBuckets / Details::PsqtTileHeight; ++j)
+ {
+ const IndexType offsetRow = j * Details::PsqtTileHeight;
+
+ const auto accTilePsqt =
+ reinterpret_cast(&accumulator.psqtAccumulation[Perspective][offsetRow]);
+ const auto entryTilePsqt = reinterpret_cast(&entry.psqtAccumulation[offsetRow]);
+
+ for (std::size_t k = 0; k < array_size(psqt); ++k)
+ psqt[k] = entryTilePsqt[k];
+
+ for (std::size_t i = 0; i < removed.size(); ++i)
+ {
+ const IndexType offset = PSQTBuckets * removed[i] + offsetRow;
+ const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]);
+
+ for (std::size_t k = 0; k < array_size(psqt); ++k)
+ psqt[k] = vsubq_s32(psqt[k], columnPsqt[k]);
+ }
+ for (std::size_t i = 0; i < added.size(); ++i)
+ {
+ const IndexType offset = PSQTBuckets * added[i] + offsetRow;
+ const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]);
+
+ for (std::size_t k = 0; k < array_size(psqt); ++k)
+ psqt[k] = vaddq_s32(psqt[k], columnPsqt[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(psqt); ++k)
+ entryTilePsqt[k] = psqt[k];
+ for (std::size_t k = 0; k < array_size(psqt); ++k)
+ accTilePsqt[k] = psqt[k];
+ }
+}
+
+template StateInfo::*accPtr>
+void FeatureTransformer::convert_accumulators(
+ const Position& pos, OutputType* output) const {
+ static constexpr IndexType OutputChunkSize = 16 / sizeof(OutputType);
+ static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+
+ static constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
+
+ const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+ const auto& accumulation = (pos.state()->*accPtr).accumulation;
+
+ for (IndexType p = 0; p < 2; ++p)
+ {
+ const auto in0 = reinterpret_cast(&(accumulation[perspectives[p]][0]));
+ const auto in1 =
+ reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+ const auto out = reinterpret_cast(&output[(HalfDimensions / 2) * p]);
+
+ for (IndexType j = 0; j < NumOutputChunks; ++j)
+ {
+ static const int16x8_t Zeroes = vdupq_n_s16(0);
+ static const int16x8_t Ones = vdupq_n_s16(127 * 2);
+
+ const int16x8_t sum0a =
+ vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 0], Ones), Zeroes), 6);
+ const int16x8_t sum0b =
+ vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 1], Ones), Zeroes), 6);
+ const int16x8_t sum1a = vminq_s16(in1[j * 2 + 0], Ones);
+ const int16x8_t sum1b = vminq_s16(in1[j * 2 + 1], Ones);
+
+ const int16x8_t pa = vqdmulhq_s16(sum0a, sum1a);
+ const int16x8_t pb = vqdmulhq_s16(sum0b, sum1b);
+
+ out[j] = vcombine_u8(vqmovun_s16(pa), vqmovun_s16(pb));
+ }
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE
+
+#endif // !__ARM_NEON
+
+#endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/arch/generic/Makefile b/src/arch/generic/Makefile
new file mode 100644
index 00000000000..230916e41b7
--- /dev/null
+++ b/src/arch/generic/Makefile
@@ -0,0 +1,32 @@
+define GENERIC_HELP_STRING
+To build Stockfish, run the following command:
+
+make [ARCH=] [COMP=]
+
+Build presets for other architectures:
+
+general-64 > 64-bit generic
+general-32 > 32-bit generic
+
+To see architecture-specific build options, run 'make help ARCH='.
+Currently supported values: x86, arm, generic
+
+endef
+export GENERIC_HELP_STRING
+
+ifeq ($(COMP),$(filter $(COMP),gcc clang icx))
+
+ifeq ($(ARCH),general-64)
+ CXXFLAGS += -m64
+else ifeq ($(ARCH),general-32)
+ CXXFLAGS += -m32
+endif
+
+endif
+
+help-arch:
+ @echo "$${GENERIC_HELP_STRING}"
+
+config-sanity-arch:
+ @test "$(ARCH)" = "general-64" || \
+ test "$(ARCH)" = "general-32"
diff --git a/src/arch/generic/arch.h b/src/arch/generic/arch.h
new file mode 100644
index 00000000000..3641ab6f3f2
--- /dev/null
+++ b/src/arch/generic/arch.h
@@ -0,0 +1,51 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef GENERIC_ARCH_H_INCLUDED
+#define GENERIC_ARCH_H_INCLUDED
+
+#include
+#include
+
+#include "common.h"
+
+namespace Stockfish {
+
+// There is no practical way to detect the register width, so we assume that
+// it is always 64-bit if address size is 64-bit.
+inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8;
+
+inline constexpr bool ArchImpl::UsePEXT = false;
+
+template
+inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {}
+
+template
+inline unsigned int ArchImpl::popcount(T n) {
+ static_assert(std::is_integral_v && sizeof(T) <= 8);
+ return __popcount_value(n);
+}
+
+template
+inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) {
+ return 0;
+}
+
+} // namespace Stockfish
+
+#endif // GENERIC_ARCH_H_INCLUDED
diff --git a/src/arch/generic/nnue/layers/affine_transform.h b/src/arch/generic/nnue/layers/affine_transform.h
new file mode 100644
index 00000000000..341816d3801
--- /dev/null
+++ b/src/arch/generic/nnue/layers/affine_transform.h
@@ -0,0 +1,59 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#define GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+constexpr IndexType AffineTransform::get_weight_index(IndexType i) {
+ return i;
+}
+
+template
+void AffineTransform::propagate(const InputType* input, OutputType* output) const {
+ std::memcpy(output, biases, sizeof(OutputType) * OutputDimensions);
+
+ // Traverse weights in transpose order to take advantage of input sparsity
+ for (IndexType i = 0; i < InputDimensions; ++i)
+ {
+ const InputType in = input[i];
+ if (in)
+ {
+ const WeightType* w = &weights[i];
+ for (IndexType j = 0; j < OutputDimensions; ++j)
+ output[j] += w[j * PaddedInputDimensions] * in;
+ }
+ }
+}
+
+template
+using AffineTransformSparseInput = AffineTransform;
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
diff --git a/src/arch/generic/nnue/layers/clipped_relu.h b/src/arch/generic/nnue/layers/clipped_relu.h
new file mode 100644
index 00000000000..ba965b9b8e7
--- /dev/null
+++ b/src/arch/generic/nnue/layers/clipped_relu.h
@@ -0,0 +1,40 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#define GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+void ClippedReLU::propagate(const InputType* input, OutputType* output) const {
+ for (IndexType i = 0; i < InputDimensions; ++i)
+ output[i] = static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127));
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/generic/nnue/layers/sqr_clipped_relu.h b/src/arch/generic/nnue/layers/sqr_clipped_relu.h
new file mode 100644
index 00000000000..5048df5406a
--- /dev/null
+++ b/src/arch/generic/nnue/layers/sqr_clipped_relu.h
@@ -0,0 +1,45 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#define GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const {
+ // The correct formula is to divide by 127, but we need to make it fast
+ // therefore right-shift by extra 7 bits is used instead. Needs to be
+ // accounted for in the trainer.
+ for (IndexType i = 0; i < InputDimensions; ++i)
+ output[i] = static_cast(std::min(
+ std::int64_t(127), std::int64_t(input[i]) * input[i] >> (2 * WeightScaleBits + 7)));
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/generic/nnue/nnue_feature_transformer.h b/src/arch/generic/nnue/nnue_feature_transformer.h
new file mode 100644
index 00000000000..f885980541a
--- /dev/null
+++ b/src/arch/generic/nnue/nnue_feature_transformer.h
@@ -0,0 +1,146 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#define GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+
+#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include
+#include
+
+#include "position.h"
+#include "nnue/nnue_accumulator.h"
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE {
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::permute_weights() {}
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::
+ apply_accumulator_updates_incremental(StateInfo* computed,
+ StateInfo* next,
+ FeatureSet::IndexList& removed,
+ FeatureSet::IndexList& added) const {
+
+ std::memcpy((next->*accPtr).accumulation[Perspective],
+ (computed->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType));
+ std::memcpy((next->*accPtr).psqtAccumulation[Perspective],
+ (computed->*accPtr).psqtAccumulation[Perspective],
+ PSQTBuckets * sizeof(PSQTWeightType));
+
+ // Difference calculation for the deactivated features
+ for (const auto index : removed)
+ {
+ const IndexType wOffset = HalfDimensions * index;
+ const IndexType pwOffset = PSQTBuckets * index;
+
+ for (IndexType i = 0; i < HalfDimensions; ++i)
+ (next->*accPtr).accumulation[Perspective][i] -= weights[wOffset + i];
+
+ for (IndexType i = 0; i < PSQTBuckets; ++i)
+ (next->*accPtr).psqtAccumulation[Perspective][i] -= psqtWeights[pwOffset + i];
+ }
+
+ // Difference calculation for the activated features
+ for (const auto index : added)
+ {
+ const IndexType wOffset = HalfDimensions * index;
+ const IndexType pwOffset = PSQTBuckets * index;
+
+ for (IndexType i = 0; i < HalfDimensions; ++i)
+ (next->*accPtr).accumulation[Perspective][i] += weights[wOffset + i];
+
+ for (IndexType i = 0; i < PSQTBuckets; ++i)
+ (next->*accPtr).psqtAccumulation[Perspective][i] += psqtWeights[pwOffset + i];
+ }
+}
+
+template StateInfo::*accPtr>
+template
+void FeatureTransformer::
+ apply_accumulator_updates_refresh_cache(
+ Accumulator& accumulator,
+ typename AccumulatorCaches::Cache::Entry& entry,
+ FeatureSet::IndexList removed,
+ FeatureSet::IndexList added) const {
+ for (const auto index : removed)
+ {
+ const IndexType wOffset = HalfDimensions * index;
+ const IndexType pwOffset = PSQTBuckets * index;
+
+ for (IndexType j = 0; j < HalfDimensions; ++j)
+ entry.accumulation[j] -= weights[wOffset + j];
+
+ for (IndexType k = 0; k < PSQTBuckets; ++k)
+ entry.psqtAccumulation[k] -= psqtWeights[pwOffset + k];
+ }
+ for (const auto index : added)
+ {
+ const IndexType wOffset = HalfDimensions * index;
+ const IndexType pwOffset = PSQTBuckets * index;
+
+ for (IndexType j = 0; j < HalfDimensions; ++j)
+ entry.accumulation[j] += weights[wOffset + j];
+
+ for (IndexType k = 0; k < PSQTBuckets; ++k)
+ entry.psqtAccumulation[k] += psqtWeights[pwOffset + k];
+ }
+
+ // The accumulator of the refresh entry has been updated.
+ // Now copy its content to the actual accumulator we were refreshing.
+ std::memcpy(accumulator.accumulation[Perspective], entry.accumulation,
+ sizeof(BiasType) * HalfDimensions);
+ std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation,
+ sizeof(PSQTWeightType) * PSQTBuckets);
+}
+
+template StateInfo::*accPtr>
+void FeatureTransformer::convert_accumulators(
+ const Position& pos, OutputType* output) const {
+ const int perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
+ const auto& accumulation = (pos.state()->*accPtr).accumulation;
+
+ for (IndexType p = 0; p < 2; ++p)
+ {
+ const IndexType offset = (HalfDimensions / 2) * p;
+
+ for (IndexType j = 0; j < HalfDimensions / 2; ++j)
+ {
+ BiasType sum0 = accumulation[perspectives[p]][j];
+ BiasType sum1 = accumulation[perspectives[p]][j + HalfDimensions / 2];
+ sum0 = std::clamp(sum0, 0, 127 * 2);
+ sum1 = std::clamp(sum1, 0, 127 * 2);
+ output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512);
+ }
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE
+
+#endif // GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
diff --git a/src/arch/i386/Makefile b/src/arch/i386/Makefile
new file mode 100644
index 00000000000..f9a97330a2a
--- /dev/null
+++ b/src/arch/i386/Makefile
@@ -0,0 +1,116 @@
+define HELP_STRING_I386
+To build Stockfish, run the following command:
+
+make [ARCH=] [NO_AVX512=1]
+
+Build presets for x86 architecture:
+
+x86-64-avx512-vnni > 64-bit, AVX-512 with BW/VL/VNNI
+x86-64-avx512 > 64-bit, AVX-512 with BW/VL
+x86-64-avxvnni > 64-bit, AVX2 with AVX-VNNI
+x86-64-bmi2 > 64-bit, AVX2 and BMI2
+x86-64-avx2 > 64-bit, AVX2
+x86-64-avx > 64-bit, AVX
+x86-64-popcnt > 64-bit, POPCNT with SSE4.2
+x86-64-sse41 > 64-bit, SSE4.1
+x86-64-ssse3 > 64-bit, SSSE3
+x86-64 > 64-bit generic (supports up to SSE2)
+x86-32 > 32-bit generic (supports up to SSE2)
+
+Build presets for x86 architecture, AMD processors:
+
+x86-64-bmi > 64-bit, AVX with BMI
+x86-64-abm > 64-bit, SSE4a with ABM
+
+x86 options:
+
+ NO_AVX512
+
+If NO_AVX512 is set, Stockfish will use 256-bit SIMD instructions instead of
+512-bit ones. This option is only available if AVX-512 is enabled.
+
+endef
+export HELP_STRING_I386
+
+# Compatibility layer for old Makefile presets
+ifeq ($(ARCH),x86-64-vnni512)
+ override ARCH := x86-64-avx512-vnni
+endif
+
+ifeq ($(ARCH),x86-64-vnni256)
+ override ARCH := x86-64-avx512-vnni
+ override NO_AVX512 := 1
+endif
+
+ifneq ($(filter $(COMP),gcc clang icx),)
+
+ifeq ($(ARCH),x86-64-avx512-vnni)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \
+ -mavx512vl -mavx512vnni
+ ifeq ($(NO_AVX512),1)
+ SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512
+ endif
+else ifeq ($(ARCH),x86-64-avx512)
+ # Xeon Phi is no longer supported on future compiler versions.
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \
+ -mavx512vl
+ ifeq ($(NO_AVX512),1)
+ SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512
+ endif
+else ifeq ($(ARCH),x86-64-avxvnni)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavxvnni
+else ifeq ($(ARCH),x86-64-bmi2)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi -mavx2 -mbmi2
+else ifeq ($(ARCH),x86-64-avx2)
+ # VIA Eden X4 does not support BMI/BMI2, need a custom profile if required.
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi -mavx2
+else ifeq ($(ARCH),x86-64-bmi)
+ # Some AMD processors (AMD family 15h) have BMI1 only.
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx -mbmi
+else ifeq ($(ARCH),x86-64-avx)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt -mavx
+else ifeq ($(ARCH),x86-64-popcnt)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \
+ -mpopcnt
+else ifeq ($(ARCH),x86-64-sse41)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1
+else ifeq ($(ARCH),x86-64-ssse3)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3
+else ifeq ($(ARCH),x86-64-abm)
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -msse4a -mabm
+else ifeq ($(ARCH),x86-64)
+ # Modern compilers add SSE2 flag by default.
+ SF_CXXFLAGS += -m64
+else ifeq ($(ARCH),x86-32)
+ SF_CXXFLAGS += -m32
+endif
+
+endif # gcc clang icx
+
+.PHONY: help-arch config-sanity-arch
+
+help-arch:
+ @echo "$${HELP_STRING_I386}"
+
+config-sanity-arch:
+ @[ "$(ARCH)" = "x86-64-avx512-vnni" -o \
+ "$(ARCH)" = "x86-64-avx512" -o \
+ "$(ARCH)" = "x86-64-avxvnni" -o \
+ "$(ARCH)" = "x86-64-bmi2" -o \
+ "$(ARCH)" = "x86-64-avx2" -o \
+ "$(ARCH)" = "x86-64-bmi" -o \
+ "$(ARCH)" = "x86-64-avx" -o \
+ "$(ARCH)" = "x86-64-popcnt" -o \
+ "$(ARCH)" = "x86-64-sse41" -o \
+ "$(ARCH)" = "x86-64-ssse3" -o \
+ "$(ARCH)" = "x86-64-abm" -o \
+ "$(ARCH)" = "x86-64" -o \
+ "$(ARCH)" = "x86-32" \
+ ]
diff --git a/src/arch/i386/arch.h b/src/arch/i386/arch.h
new file mode 100644
index 00000000000..1ef79088240
--- /dev/null
+++ b/src/arch/i386/arch.h
@@ -0,0 +1,595 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_ARCH_H_INCLUDED
+#define I386_ARCH_H_INCLUDED
+
+#if !defined(__i386__) && !defined(__amd64__)
+#error "Not supported in the current architecture."
+#endif
+
+#include
+#include
+#include
+
+#include "common.h"
+
+#if defined(__AVX__)
+
+#include
+
+#elif defined(__SSE4_1__)
+
+#include
+
+#elif defined(__SSSE3__)
+
+#include
+
+#elif defined(__SSE2__)
+
+#include
+
+// Some AMD processors with ABM do not support SSSE3/SSE4.1.
+#if defined(__POPCNT__)
+#include
+#endif
+
+#elif defined(__SSE__)
+
+#include
+
+#endif
+
+#ifdef ARCH_NATIVE
+
+// Do not use BMI2 PDEP/PEXT on AMD Zen 1/2.
+#if defined(__BMI2__) && (defined(__znver1__) || defined(__znver2__))
+#define NO_PDEP_PEXT 1
+#endif
+
+// Enable AVX-512 on AMD Zen 5 only.
+#if defined(__AVX512F__) && !defined(__znver5__)
+#define NO_AVX512 1
+#endif
+
+#endif // ARCH_NATIVE
+
+namespace Stockfish {
+
+enum class PrefetchHint {
+ ET0 = 7,
+ T0 = 3,
+ T1 = 2,
+ T2 = 1,
+ NTA = 0
+};
+
+// Register size is equal to address bits.
+inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8;
+
+#if defined(__BMI2__) && !defined(NO_PDEP_PEXT)
+inline constexpr bool ArchImpl::UsePEXT = true;
+#else
+inline constexpr bool ArchImpl::UsePEXT = false;
+#endif
+
+template
+inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {
+#ifdef __SSE__
+ constexpr int __Hint = [] {
+ if constexpr (Hint == -1)
+ return 3;
+ else
+#ifdef __PRFCHW__
+ return (Hint & 0x4) ? 7 : (Hint & 0x3);
+#else
+ return Hint & 0x3;
+#endif
+ }();
+
+ // GCC doesn't comply with Intel Intrinsics Guide and uses enum instead
+ // of int.
+#if STOCKFISH_COMPILER == STOCKFISH_COMPILER_GCC
+ _mm_prefetch(m, [] {
+ if constexpr (__Hint == 7)
+ return _MM_HINT_ET0;
+ else if constexpr (__Hint == 3)
+ return _MM_HINT_T0;
+ else if constexpr (__Hint == 2)
+ return _MM_HINT_T1;
+ else if constexpr (__Hint == 1)
+ return _MM_HINT_T2;
+ else
+ return _MM_HINT_NTA;
+ }());
+#else
+ _mm_prefetch(m, __Hint);
+#endif
+
+#endif // __SSE__
+}
+
+template
+inline unsigned int ArchImpl::popcount(T n) {
+ static_assert(std::is_integral_v && sizeof(T) <= 8);
+
+#ifdef __POPCNT__
+ if constexpr (sizeof(T) == 8)
+ return _mm_popcnt_u64(std::uint64_t(n));
+ else
+ return _mm_popcnt_u32(std::uint32_t(n));
+#else
+ if constexpr (!is_64bit() && sizeof(T) == 8)
+ return __popcount_use_table(n);
+ else
+ return __popcount_value(n);
+#endif
+}
+
+template
+inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) {
+#if defined(__BMI2__) && !defined(NO_PDEP_PEXT)
+ static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8));
+
+ if constexpr (sizeof(T) == 8)
+ return _pext_u64(std::uint64_t(n), std::uint64_t(mask));
+ else
+ return _pext_u32(std::uint32_t(n), std::uint32_t(mask));
+#else
+ return 0;
+#endif
+}
+
+// ===========================================================================
+// The functions below are used on i386/AMD64 targets only.
+// ===========================================================================
+
+template
+inline std::make_unsigned_t blsr(T n) {
+ static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8));
+
+#ifdef __BMI__
+ if constexpr (sizeof(T) == 8)
+ return _blsr_u64(std::uint64_t(n));
+ else
+ return _blsr_u32(std::uint32_t(n));
+#else
+ return std::make_unsigned_t(n) & std::make_unsigned_t(n - 1);
+#endif
+}
+
+template
+inline int tzcnt(T n) {
+ static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8));
+
+#ifdef __BMI__
+ if constexpr (sizeof(T) == 8)
+ return _tzcnt_u64(std::uint64_t(n));
+ else
+ return _tzcnt_u32(std::uint32_t(n));
+#else
+ assert(n != 0);
+
+ if constexpr (sizeof(T) == 8)
+ return __builtin_ctzll(n);
+ else
+ return __builtin_ctz(n);
+#endif
+}
+
+#ifdef __SSE2__
+
+template
+struct is_valid_vector {
+ static constexpr bool value = sizeof(T) == 16
+#ifdef __AVX2__
+ || sizeof(T) == 32
+#endif
+#ifdef __AVX512F__
+ || sizeof(T) == 64
+#endif
+ ;
+};
+
+template
+inline constexpr bool is_valid_vector_v = is_valid_vector::value;
+
+template
+inline T _mm_setzero_v() {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_setzero_si512();
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+ return _mm256_setzero_si256();
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_setzero_si128();
+}
+
+template
+inline T _mm_set1_epi16_v(std::uint16_t n) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_set1_epi16(n);
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+ return _mm256_set1_epi16(n);
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_set1_epi16(n);
+}
+
+template
+inline T _mm_set1_epi32_v(std::uint32_t n) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_set1_epi32(n);
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+ return _mm256_set1_epi32(n);
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_set1_epi32(n);
+}
+
+template
+inline T _mm_packus_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_packus_epi16(a, b);
+#else
+ static_assert(false, "_mm_packus_epi16_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_packus_epi16(a, b);
+#else
+ static_assert(false, "_mm_packus_epi16_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_packus_epi16(a, b);
+}
+
+template
+inline T _mm_add_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_add_epi16(a, b);
+#else
+ static_assert(false, "_mm_add_epi16_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_add_epi16(a, b);
+#else
+ static_assert(false, "_mm_add_epi16_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_add_epi16(a, b);
+}
+
+template
+inline T _mm_add_epi32_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_add_epi32(a, b);
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_add_epi32(a, b);
+#else
+ static_assert(false, "_mm_add_epi32_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_add_epi32(a, b);
+}
+
+template
+inline T _mm_sub_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_sub_epi16(a, b);
+#else
+ static_assert(false, "_mm_sub_epi16_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_sub_epi16(a, b);
+#else
+ static_assert(false, "_mm_sub_epi16_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_sub_epi16(a, b);
+}
+
+template
+inline T _mm_sub_epi32_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_sub_epi32(a, b);
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_sub_epi32(a, b);
+#else
+ static_assert(false, "_mm_sub_epi32_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_sub_epi32(a, b);
+}
+
+template
+inline T _mm_mulhi_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_mulhi_epi16(a, b);
+#else
+ static_assert(false, "vmulhi_16<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_mulhi_epi16(a, b);
+#else
+ static_assert(false, "vmulhi_16<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_mulhi_epi16(a, b);
+}
+
+template
+inline T _mm_slli_epi16_v(T a, int n) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_slli_epi16(a, n);
+#else
+ static_assert(false, "_mm_slli_epi16_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_slli_epi16(a, n);
+#else
+ static_assert(false, "_mm_slli_epi16_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_slli_epi16(a, n);
+}
+
+template
+inline T _mm_max_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_max_epi16(a, b);
+#else
+ static_assert(false, "_mm_max_epi16_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_max_epi16(a, b);
+#else
+ static_assert(false, "_mm_max_epi16_v<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_max_epi16(a, b);
+}
+
+template
+inline T _mm_min_epi16_v(T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+#ifdef __AVX512BW__
+ return _mm512_min_epi16(a, b);
+#else
+ static_assert(false, "vmin_16<__m512i> is not allowed without AVX-512 BW.");
+#endif
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+#ifdef __AVX2__
+ return _mm256_min_epi16(a, b);
+#else
+ static_assert(false, "vmin_16<__m256i> is not allowed without AVX2.");
+#endif
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ return _mm_min_epi16(a, b);
+}
+
+template
+inline std::int32_t _mm_reduce_add_epi32_v(T a) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ return _mm512_reduce_add_epi32(a);
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+ {
+ __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_BADC));
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_CDAB));
+ return _mm_cvtsi128_si32(sum);
+ }
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ {
+ a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0x4E)); // _MM_PERM_BADC
+ a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0xB1)); // _MM_PERM_CDAB
+ return _mm_cvtsi128_si32(a);
+ }
+}
+
+// Non-VNNI implementation of dpbusd works even with type saturation, only
+// because output values are clamped in ReLU layers immediately after
+// AffineTransform layer. Do not use this without VNNI for general purpose.
+template
+inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) {
+ static_assert(is_valid_vector_v);
+
+#ifdef __AVX512F__
+ if constexpr (sizeof(T) == 64)
+ {
+#if defined(__AVX512VNNI__)
+
+ acc = _mm512_dpbusd_epi32(acc, a, b);
+
+#elif defined(__AVX512BW__)
+
+ __m512i product = _mm512_maddubs_epi16(a, b);
+ product = _mm512_madd_epi16(product, _mm512_set1_epi16(1));
+ acc = _mm512_add_epi32(acc, product);
+
+#else
+ static_assert(false, "_mm_dpbusd_epi32_v<__m512i> is not allowed without AVX-512 BW.");
+#endif
+ }
+#endif
+
+#ifdef __AVX__
+ if constexpr (sizeof(T) == 32)
+ {
+#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__)
+
+ acc = _mm256_dpbusd_epi32(acc, a, b);
+
+#elif defined(__AVX2__)
+
+ __m256i product = _mm256_madd_epi16(_mm256_maddubs_epi16(a, b), _mm256_set1_epi16(1));
+ acc = _mm256_add_epi32(acc, product);
+
+#else
+ static_assert(false, "_mm_dpbusd_epi32_v<__m256i> is not allowed without AVX2.");
+#endif
+ }
+#endif
+
+ if constexpr (sizeof(T) == 16)
+ {
+#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__)
+
+ acc = _mm_dpbusd_epi32(acc, a, b);
+
+#elif defined(__SSSE3__)
+
+ __m128i product = _mm_madd_epi16(_mm_maddubs_epi16(a, b), _mm_set1_epi16(1));
+ acc = _mm_add_epi32(acc, product);
+
+#else
+
+ __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128());
+ __m128i a1 = _mm_unpackhi_epi8(a, _mm_setzero_si128());
+ __m128i sgn = _mm_cmplt_epi8(b, _mm_setzero_si128());
+ __m128i b0 = _mm_unpacklo_epi8(b, sgn);
+ __m128i b1 = _mm_unpackhi_epi8(b, sgn);
+ __m128i product0 = _mm_madd_epi16(a0, b0);
+ __m128i product1 = _mm_madd_epi16(a1, b1);
+ __m128i product = _mm_madd_epi16(_mm_packs_epi32(product0, product1), _mm_set1_epi16(1));
+ acc = _mm_add_epi32(acc, product);
+
+#endif
+ }
+}
+
+#endif // __SSE2__
+
+} // namespace Stockfish
+
+#endif // I386_ARCH_H_INCLUDED
diff --git a/src/arch/i386/nnue/layers/affine_transform.h b/src/arch/i386/nnue/layers/affine_transform.h
new file mode 100644
index 00000000000..9155089a16a
--- /dev/null
+++ b/src/arch/i386/nnue/layers/affine_transform.h
@@ -0,0 +1,118 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check x86/AMD64 SIMD extensions.
+// If none is defined, fall back to the generic implementation.
+#ifndef __SSE2__
+
+#include "arch/generic/nnue/layers/affine_transform.h"
+
+#else
+
+#include "../../arch.h"
+
+#include
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+constexpr IndexType AffineTransform::get_weight_index(IndexType i) {
+ return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
+ + i / PaddedInputDimensions * 4 + i % 4;
+}
+
+template
+void AffineTransform::propagate(const InputType* input, OutputType* output) const {
+ if constexpr (OutputDimensions > 1)
+ {
+#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \
+ && !defined(NO_AVX512)
+ using vec_t = __m512i;
+#elif defined(__AVX2__)
+ using vec_t = __m256i;
+#else
+ using vec_t = __m128i;
+#endif
+
+ static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType);
+ static_assert(OutputDimensions % OutputLanes == 0);
+
+ static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4;
+ static constexpr IndexType NumRegs = OutputDimensions / OutputLanes;
+
+ vec_t acc[NumRegs];
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = reinterpret_cast(biases)[k];
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+ const vec_t in =
+ _mm_set1_epi32_v(reinterpret_cast(input)[i]);
+ const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ _mm_dpbusd_epi32_v(acc[k], in, col[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ reinterpret_cast(output)[k] = acc[k];
+ }
+ else if constexpr (OutputDimensions == 1)
+ {
+ // We cannot use AVX512 for the last layer because there are only
+ // 32 inputs and the buffer is not padded to 64 elements.
+#if defined(__AVX2__)
+ using vec_t = __m256i;
+#else
+ using vec_t = __m128i;
+#endif
+
+ static constexpr IndexType InputLanes = sizeof(vec_t) / sizeof(InputType);
+ static_assert(PaddedInputDimensions % InputLanes == 0);
+
+ static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes;
+
+ vec_t sum = _mm_setzero_v();
+
+ for (IndexType j = 0; j < NumChunks; ++j)
+ {
+ const vec_t in = reinterpret_cast(input)[j];
+ const vec_t row = reinterpret_cast(weights)[j];
+ _mm_dpbusd_epi32_v(sum, in, row);
+ }
+
+ output[0] = _mm_reduce_add_epi32_v(sum) + biases[0];
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // !__SSE2__
+
+#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
diff --git a/src/arch/i386/nnue/layers/affine_transform_sparse_input.h b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h
new file mode 100644
index 00000000000..dbc20f704ad
--- /dev/null
+++ b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h
@@ -0,0 +1,168 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+
+#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+#include "../../arch.h"
+
+#include
+#include
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+#ifdef __SSSE3__
+
+alignas(CacheLineSize) static const std::array, 256> lookupIndices =
+ [] {
+ std::array, 256> array{};
+ for (std::uint64_t i = 0; i < 256; ++i)
+ {
+ std::uint64_t j = i, k = 0;
+ while (j)
+ array[i][k++] = tzcnt(j), j = blsr(j);
+ }
+ return array;
+ }();
+
+template
+class AffineTransformSparseInput: public AffineTransform {
+ __DEFINE_BASE_PROPERTIES
+
+ static_assert(OutputDimensions % 16 == 0,
+ "OutputDimensions must be multiple of 16 for this layer.");
+
+ public:
+ void propagate(const InputType* input, OutputType* output) const;
+
+ private:
+ template
+ static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) {
+#if defined(__AVX512F__) && !defined(NO_AVX512)
+ using vec_t = __m512i;
+#elif defined(__AVX__)
+ using vec_t = __m256i;
+#else
+ using vec_t = __m128i;
+#endif
+
+ static constexpr IndexType InputLanes = sizeof(vec_t) / 4;
+
+ // Inputs are processed InputLanes at a time and outputs are processed
+ // 8 at a time so we process in chunks of max(InputLanes, 8).
+ static constexpr IndexType ChunkSize = std::max(InputLanes, 8);
+ static constexpr IndexType NumChunks = InputDimensions / ChunkSize;
+ static constexpr IndexType InputsPerChunk = ChunkSize / InputLanes;
+ static constexpr IndexType OutputsPerChunk = ChunkSize / 8;
+
+ IndexType count = 0;
+ __m128i base = _mm_setzero_si128();
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+ std::uint32_t nnz = 0;
+ for (IndexType j = 0; j < InputsPerChunk; ++j)
+ {
+ const vec_t chunk = reinterpret_cast(input)[i * InputsPerChunk + j];
+
+ // Since all 32-bit blocks are non-negative, it is safe to use cmpgt
+ // if the target architecture does not support cmpneq.
+#if defined(__AVX512F__) && !defined(NO_AVX512)
+ const std::uint32_t mask = _mm512_cmpneq_epi32_mask(chunk, _mm512_setzero_si512());
+#elif defined(__AVX2__)
+ const std::uint32_t mask = _mm256_movemask_ps(
+ _mm256_castsi256_ps(_mm256_cmpgt_epi32(chunk, _mm256_setzero_si256())));
+#elif defined(__AVX__)
+ const std::uint32_t mask = _mm256_movemask_ps(
+ _mm256_cmp_ps(_mm256_castsi256_ps(chunk), _mm256_setzero_ps(), _CMP_NEQ_UQ));
+#else
+ const std::uint32_t mask =
+ _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(chunk, _mm_setzero_si128())));
+#endif
+
+ nnz |= mask << (j * InputLanes);
+ }
+ for (IndexType j = 0; j < OutputsPerChunk; ++j)
+ {
+ const std::uint8_t lookup = (nnz >> (j * 8)) & 0xFF;
+ const __m128i offsets = *reinterpret_cast(&lookupIndices[lookup]);
+ _mm_storeu_si128(reinterpret_cast<__m128i_u*>(indices + count),
+ _mm_add_epi16(base, offsets));
+ count += popcount(lookup);
+ base = _mm_add_epi16(base, _mm_set1_epi16(8));
+ }
+ }
+
+ return count;
+ }
+};
+
+template
+void AffineTransformSparseInput::propagate(const InputType* input,
+ OutputType* output) const {
+#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \
+ && !defined(NO_AVX512)
+ using vec_t = __m512i;
+#elif defined(__AVX2__)
+ using vec_t = __m256i;
+#else
+ using vec_t = __m128i;
+#endif
+
+ static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType);
+
+ static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4;
+ static constexpr IndexType NumRegs = OutputDimensions / OutputLanes;
+
+ vec_t acc[NumRegs];
+ std::uint16_t nnz[NumChunks];
+ IndexType count = populate_nz_indices(input, nnz);
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ acc[k] = reinterpret_cast(biases)[k];
+
+ for (IndexType j = 0; j < count; ++j)
+ {
+ const auto i = nnz[j];
+ const vec_t in = _mm_set1_epi32_v(reinterpret_cast(input)[i]);
+ const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]);
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ _mm_dpbusd_epi32_v(acc[k], in, col[k]);
+ }
+
+ for (std::size_t k = 0; k < array_size(acc); ++k)
+ reinterpret_cast(output)[k] = acc[k];
+}
+
+#else
+
+template
+using AffineTransformSparseInput = AffineTransform;
+
+#endif // __SSSE3__
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
diff --git a/src/arch/i386/nnue/layers/clipped_relu.h b/src/arch/i386/nnue/layers/clipped_relu.h
new file mode 100644
index 00000000000..187d77554f0
--- /dev/null
+++ b/src/arch/i386/nnue/layers/clipped_relu.h
@@ -0,0 +1,133 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#define I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check x86/AMD64 SIMD extensions.
+// If none is defined, fall back to the generic implementation.
+#ifndef __SSE2__
+
+#include "arch/generic/nnue/layers/clipped_relu.h"
+
+#else
+
+#include "../../arch.h"
+
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+void ClippedReLU::propagate(const InputType* input, OutputType* output) const {
+ static_assert(PaddedOutputDimensions % 32 == 0);
+
+ // Do not use 256-bit registers on AVX as it does not have shift
+ // instructions, instead fall back to SSE4.1.
+#ifdef __AVX2__
+
+#if defined(__AVX512F__) && defined(__AVX512BW__) && !defined(NO_AVX512)
+ if constexpr (PaddedOutputDimensions >= 64)
+ {
+ static constexpr IndexType NumChunks = PaddedOutputDimensions / 64;
+
+ static const __m512i permuteTable512 =
+ _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ const auto in = reinterpret_cast(input);
+ const auto out = reinterpret_cast<__m512i*>(output);
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+ const __m512i words0 =
+ _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits);
+ const __m512i words1 =
+ _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits);
+
+ out[i] = _mm512_permutexvar_epi32(permuteTable512, _mm512_packs_epi16(words0, words1));
+ }
+ }
+ constexpr IndexType Start = PaddedOutputDimensions / 64 * 64;
+#else
+ constexpr IndexType Start = 0;
+#endif
+
+ if constexpr (Start != PaddedOutputDimensions)
+ {
+ static constexpr IndexType NumChunks = PaddedOutputDimensions / 32;
+
+ static const __m256i permuteTable256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+
+ const auto in = reinterpret_cast(input);
+ const auto out = reinterpret_cast<__m256i*>(output);
+
+ for (IndexType i = Start / 32; i < NumChunks; ++i)
+ {
+ const __m256i words0 =
+ _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits);
+ const __m256i words1 =
+ _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits);
+
+ out[i] =
+ _mm256_permutevar8x32_epi32(_mm256_packs_epi16(words0, words1), permuteTable256);
+ }
+ }
+
+#else // __SSE2__
+
+ static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 16;
+
+ const auto in = reinterpret_cast(input);
+ const auto out = reinterpret_cast<__m128i*>(output);
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+#ifdef __SSE4_1__
+ const __m128i words0 =
+ _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits);
+ const __m128i words1 =
+ _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits);
+
+ out[i] = _mm_packs_epi16(words0, words1);
+#else
+ static const __m128i s8min = _mm_set1_epi8(-0x80);
+
+ const __m128i words0 =
+ _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits);
+ const __m128i words1 =
+ _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits);
+ const __m128i bytes = _mm_packs_epi16(words0, words1);
+
+ out[i] = _mm_subs_epi8(_mm_adds_epi8(bytes, s8min), s8min);
+#endif
+ }
+
+#endif
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // !__SSE2__
+
+#endif // I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/i386/nnue/layers/sqr_clipped_relu.h b/src/arch/i386/nnue/layers/sqr_clipped_relu.h
new file mode 100644
index 00000000000..bdab25693d0
--- /dev/null
+++ b/src/arch/i386/nnue/layers/sqr_clipped_relu.h
@@ -0,0 +1,68 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#define I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+
+#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check x86/AMD64 SIMD extensions.
+// If none is defined, fall back to the generic implementation.
+#ifndef __SSE2__
+
+#include "arch/generic/nnue/layers/sqr_clipped_relu.h"
+
+#else
+
+#include "../../arch.h"
+
+#include
+
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE::Layers {
+
+template
+void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const {
+ static constexpr IndexType NumChunks = PaddedOutputDimensions / 16;
+
+ const auto in = reinterpret_cast(input);
+ const auto out = reinterpret_cast<__m128i*>(output);
+
+ for (IndexType i = 0; i < NumChunks; ++i)
+ {
+ __m128i words0 = _mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]);
+ __m128i words1 = _mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]);
+
+ // We shift by WeightScaleBits * 2 = 12 and divide by 128 which is
+ // an additional shift-right of 7, meaning 19 in total. Mulhi
+ // strips the lower 16 bits so we need to shift by 3 more.
+ words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
+ words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);
+
+ out[i] = _mm_packs_epi16(words0, words1);
+ }
+}
+
+} // namespace Stockfish::Eval::NNUE::Layers
+
+#endif // !__SSE2__
+
+#endif // I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
diff --git a/src/arch/i386/nnue/nnue_feature_transformer.h b/src/arch/i386/nnue/nnue_feature_transformer.h
new file mode 100644
index 00000000000..a6d321f465c
--- /dev/null
+++ b/src/arch/i386/nnue/nnue_feature_transformer.h
@@ -0,0 +1,435 @@
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see .
+*/
+
+#ifndef I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#define I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+
+#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#error "Never use architecture specific header files directly."
+#endif
+
+// Check x86/AMD64 SIMD extensions.
+// If none is defined, fall back to the generic implementation.
+#ifndef __SSE2__
+
+#include "arch/generic/nnue/nnue_feature_transformer.h"
+
+#else
+
+#include "../arch.h"
+
+#include
+#include
+
+#include "misc.h"
+#include "position.h"
+#include "types.h"
+#include "nnue/nnue_accumulator.h"
+#include "nnue/nnue_common.h"
+
+namespace Stockfish::Eval::NNUE {
+
+template