diff --git a/scripts/get_native_properties.sh b/scripts/get_native_properties.sh deleted file mode 100755 index fb124021a31..00000000000 --- a/scripts/get_native_properties.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/sh - -# -# Returns properties of the native system. -# best architecture as supported by the CPU -# filename of the best binary uploaded as an artifact during CI -# - -# Check if all the given flags are present in the CPU flags list -check_flags() { - for flag; do - printf '%s\n' "$flags" | grep -q -w "$flag" || return 1 - done -} - -# Set the CPU flags list -# remove underscores and points from flags, e.g. gcc uses avx512vnni, while some cpuinfo can have avx512_vnni, some systems use sse4_1 others sse4.1 -get_flags() { - flags=$(awk '/^flags[ \t]*:|^Features[ \t]*:/{gsub(/^flags[ \t]*:[ \t]*|^Features[ \t]*:[ \t]*|[_.]/, ""); line=$0} END{print line}' /proc/cpuinfo) -} - -# Check for gcc march "znver1" or "znver2" https://en.wikichip.org/wiki/amd/cpuid -check_znver_1_2() { - vendor_id=$(awk '/^vendor_id/{print $3; exit}' /proc/cpuinfo) - cpu_family=$(awk '/^cpu family/{print $4; exit}' /proc/cpuinfo) - [ "$vendor_id" = "AuthenticAMD" ] && [ "$cpu_family" = "23" ] && znver_1_2=true -} - -# Set the file CPU x86_64 architecture -set_arch_x86_64() { - if check_flags 'avx512vnni' 'avx512dq' 'avx512f' 'avx512bw' 'avx512vl'; then - true_arch='x86-64-vnni256' - elif check_flags 'avx512f' 'avx512bw'; then - true_arch='x86-64-avx512' - elif [ -z "${znver_1_2+1}" ] && check_flags 'bmi2'; then - true_arch='x86-64-bmi2' - elif check_flags 'avx2'; then - true_arch='x86-64-avx2' - elif check_flags 'sse41' && check_flags 'popcnt'; then - true_arch='x86-64-sse41-popcnt' - else - true_arch='x86-64' - fi -} - -# Check the system type -uname_s=$(uname -s) -uname_m=$(uname -m) -case $uname_s in - 'Darwin') # Mac OSX system - case $uname_m in - 'arm64') - true_arch='apple-silicon' - file_arch='x86-64-sse41-popcnt' # Supported by Rosetta 2 - ;; - 'x86_64') - flags=$(sysctl 
-n machdep.cpu.features machdep.cpu.leaf7_features | tr '\n' ' ' | tr '[:upper:]' '[:lower:]' | tr -d '_.') - set_arch_x86_64 - if [ "$true_arch" = 'x86-64-vnni256' ] || [ "$true_arch" = 'x86-64-avx512' ]; then - file_arch='x86-64-bmi2' - fi - ;; - esac - file_os='macos' - file_ext='tar' - ;; - 'Linux') # Linux system - get_flags - case $uname_m in - 'x86_64') - file_os='ubuntu' - check_znver_1_2 - set_arch_x86_64 - ;; - 'i686') - file_os='ubuntu' - true_arch='x86-32' - ;; - 'aarch64') - file_os='android' - true_arch='armv8' - if check_flags 'asimddp'; then - true_arch="$true_arch-dotprod" - fi - ;; - 'armv7'*) - file_os='android' - true_arch='armv7' - if check_flags 'neon'; then - true_arch="$true_arch-neon" - fi - ;; - *) # Unsupported machine type, exit with error - printf 'Unsupported machine type: %s\n' "$uname_m" - exit 1 - ;; - esac - file_ext='tar' - ;; - 'CYGWIN'*|'MINGW'*|'MSYS'*) # Windows system with POSIX compatibility layer - get_flags - check_znver_1_2 - set_arch_x86_64 - file_os='windows' - file_ext='zip' - ;; - *) - # Unknown system type, exit with error - printf 'Unsupported system type: %s\n' "$uname_s" - exit 1 - ;; -esac - -if [ -z "$file_arch" ]; then - file_arch=$true_arch -fi - -file_name="stockfish-$file_os-$file_arch.$file_ext" - -printf '%s %s\n' "$true_arch" "$file_name" diff --git a/src/Makefile b/src/Makefile index 042d9479cc8..87e93cb581b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,1035 +14,400 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +default: help -### ========================================================================== -### Section 1. 
General Configuration -### ========================================================================== +.PHONY: default help strip install clean objclean profileclean net format \ + all config-sanity analyze build profile-build \ + gcc-profile-make gcc-profile-use \ + clang-profile-make clang-profile-use \ + icx-profile-make icx-profile-use -### Establish the operating system name -KERNEL := $(shell uname -s) -ifeq ($(KERNEL),Linux) - OS := $(shell uname -o) -endif +VPATH = syzygy:nnue:nnue/features -### Target Windows OS -ifeq ($(OS),Windows_NT) - ifneq ($(COMP),ndk) - target_windows = yes - endif -else ifeq ($(COMP),mingw) - target_windows = yes - ifeq ($(WINE_PATH),) - WINE_PATH := $(shell which wine) - endif -endif +SRCS := $(shell find . -name "*.cpp" ! -path "./incbin/*") +OBJS := $(notdir $(SRCS:.cpp=.o)) +HEADERS := $(shell find . -name "*.h" ! -path "./incbin/*") + +INSTALL_PREFIX := /usr/local +INSTALL_PATH := $(INSTALL_PREFIX)/bin -### Executable name -ifeq ($(target_windows),yes) - EXE = stockfish.exe +ifeq ($(OS),Windows_NT) + INSTALL_EXE := stockfish.exe else - EXE = stockfish + INSTALL_EXE := stockfish endif -### Installation dir definitions -PREFIX = /usr/local -BINDIR = $(PREFIX)/bin - -### Built-in benchmark for pgo-builds -PGOBENCH = $(WINE_PATH) ./$(EXE) bench +KERNEL := $(shell uname -s) -### Source and object files -SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \ - misc.cpp movegen.cpp movepick.cpp position.cpp \ - search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ - nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp +strip: + -@test -f stockfish && strip stockfish + -@test -f stockfish.exe && strip stockfish.exe -HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ - nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \ - nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h 
nnue/layers/simd.h \ - nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ - nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ - search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h +install: + mkdir -p -m 755 $(INSTALL_PATH) + cp $(INSTALL_EXE) $(INSTALL_PATH) + strip $(INSTALL_PATH)/$(INSTALL_EXE) -OBJS = $(notdir $(SRCS:.cpp=.o)) +clean: objclean profileclean + @rm -f .depend -VPATH = syzygy:nnue:nnue/features +objclean: + @rm -f stockfish stockfish.exe $(OBJS) *.o.tmp -### ========================================================================== -### Section 2. High-level Configuration -### ========================================================================== -# -# flag --- Comp switch --- Description -# ---------------------------------------------------------------------------- -# -# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode -# sanitize = none/ ... (-fsanitize ) -# --- ( undefined ) --- enable undefined behavior checks -# --- ( thread ) --- enable threading error checks -# --- ( address ) --- enable memory access checks -# --- ...etc... --- see compiler documentation for supported sanitizers -# optimize = yes/no --- (-O3/-fast etc.) 
--- Enable/Disable optimizations -# arch = (name) --- (-arch) --- Target architecture -# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system -# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction -# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction -# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction -# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions -# mmx = yes/no --- -mmmx --- Use Intel MMX instructions -# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2 -# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 -# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 -# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 -# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX -# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 -# vnni256 = yes/no --- -mavx256vnni --- Use Intel Vector Neural Network Instructions 512 with 256bit operands -# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 -# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture -# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions -# -# Note that Makefile is space sensitive, so when adding new architectures -# or modifying existing flags, you have to make sure there are no extra spaces -# at the end of the line for flag values. -# -# Example of use for these flags: -# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined" +profileclean: + @rm -f PGOBENCH.out + @rm -rf profdir + @rm -f stockfish.profdata *.profraw +net: + @$(SHELL) ../scripts/net.sh -### 2.1. General and architecture defaults +format: CLANG_FORMAT := $(shell command -v clang-format-18 2> /dev/null || \ + command -v clang-format 2> /dev/null) +format: + @test -n "$(CLANG_FORMAT)" || ( \ + echo "clang-format not found. 
Please install clang-format-18."; false \ + ) + @$(CLANG_FORMAT) -i $(SRCS) $(HEADERS) -style=file -ifeq ($(ARCH),) - ARCH = native -endif +### ========================================================================== -ifeq ($(ARCH), native) - override ARCH := $(shell $(SHELL) ../scripts/get_native_properties.sh | cut -d " " -f 1) -endif +CXX_REQUIRED_RULES := analyze config-sanity build profile-build all \ + gcc-profile-make gcc-profile-use \ + clang-profile-make clang-profile-use \ + icx-profile-make icx-profile-use -# explicitly check for the list of supported architectures (as listed with make help), -# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true` -ifeq ($(ARCH), $(filter $(ARCH), \ - x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \ - x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ - x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \ - armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 loongarch64)) - SUPPORTED_ARCH=true -else - SUPPORTED_ARCH=false -endif +ifeq ($(MAKELEVEL),0) +ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) optimize = yes debug = no sanitize = none -bits = 64 -prefetch = no -popcnt = no -pext = no -sse = no -mmx = no -sse2 = no -ssse3 = no -sse41 = no -avx2 = no -avxvnni = no -avx512 = no -vnni256 = no -vnni512 = no -neon = no -dotprod = no -arm_version = 0 -STRIP = strip - -ifneq ($(shell which clang-format-18 2> /dev/null),) - CLANG-FORMAT = clang-format-18 -else - CLANG-FORMAT = clang-format -endif - -### 2.2 Architecture specific - -ifeq ($(findstring x86,$(ARCH)),x86) -# x86-32/64 - -ifeq ($(findstring x86-32,$(ARCH)),x86-32) - arch = i386 - bits = 32 - sse = no - mmx = yes +ifeq ($(shell command -v $(CXX) 2> /dev/null),) + $(error Compiler $(CXX) not found) +endif + +define test-compiler-macro +$(shell echo | $(CXX) -dM -x c++ -E - | \ + grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" > 
/dev/null 2>&1 && echo 1) +endef + +define get-compiler-macro +$(shell echo | $(CXX) -dM -x c++ -E - | \ + grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" | \ + sed "s/^#define[[:space:]]\+$(1)[[:space:]]\+//") +endef + +### 1. Detect compiler type + +ifeq ($(call test-compiler-macro,__GNUC__),1) + ifeq ($(call test-compiler-macro,__INTEL_LLVM_COMPILER),1) + $(info Using Intel oneAPI DPC++/C++ Compiler) $(info ) + COMP := icx + profile_make = icx-profile-make + profile_use = icx-profile-use + else ifeq ($(call test-compiler-macro,__clang__),1) + $(info Using LLVM C/C++ Compiler (Clang)) $(info ) + COMP := clang + CLANG_VERSION := $(call get-compiler-macro,__clang_major__) + LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \ + command -v llvm-profdata 2> /dev/null) + profile_make = clang-profile-make + profile_use = clang-profile-use + export LLVM_PROFDATA + else + $(info Using GNU C/C++ Compiler) $(info ) + COMP := gcc + GCC_VERSION := $(call get-compiler-macro,__GNUC__) + profile_make = gcc-profile-make + profile_use = gcc-profile-use + endif +endif + +ifneq ($(filter $(COMP),gcc clang),) + MINGW := $(call test-compiler-macro,__MINGW32__) +endif + +ifeq ($(MINGW),1) + EXE = stockfish.exe else - arch = x86_64 - sse = yes - sse2 = yes -endif - -ifeq ($(findstring -sse,$(ARCH)),-sse) - sse = yes -endif - -ifeq ($(findstring -popcnt,$(ARCH)),-popcnt) - popcnt = yes + EXE = stockfish endif -ifeq ($(findstring -mmx,$(ARCH)),-mmx) - mmx = yes -endif - -ifeq ($(findstring -sse2,$(ARCH)),-sse2) - sse = yes - sse2 = yes -endif +export COMP MINGW EXE -ifeq ($(findstring -ssse3,$(ARCH)),-ssse3) - sse = yes - sse2 = yes - ssse3 = yes -endif - -ifeq ($(findstring -sse41,$(ARCH)),-sse41) - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes -endif +### 2. Set compiler options -ifeq ($(findstring -modern,$(ARCH)),-modern) - $(warning *** ARCH=$(ARCH) is deprecated, defaulting to ARCH=x86-64-sse41-popcnt. 
Execute `make help` for a list of available architectures. ***) - $(shell sleep 5) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes -endif - -ifeq ($(findstring -avx2,$(ARCH)),-avx2) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes -endif +# GNU C Compiler +# https://gcc.gnu.org/onlinedocs/gcc/Option-Index.html +# +# Clang Compiler +# https://clang.llvm.org/docs/ClangCommandLineReference.html +# https://clang.llvm.org/docs/DiagnosticsReference.html +# +# Intel oneAPI DPC++/C++ Compiler +# https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2024-2/alphabetical-option-list.html -ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - avxvnni = yes - pext = yes -endif +### 2.1. Common options -ifeq ($(findstring -bmi2,$(ARCH)),-bmi2) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes -endif +SF_CXXFLAGS := -std=c++17 -I. -Wall -DUSE_PTHREADS +SF_LDFLAGS := -ifeq ($(findstring -avx512,$(ARCH)),-avx512) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - avx512 = yes -endif +SF_LIBS := pthread -ifeq ($(findstring -vnni256,$(ARCH)),-vnni256) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - vnni256 = yes -endif +### 2.2. 
Compiler-specific options -ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - avx512 = yes - vnni512 = yes +ifeq ($(COMP),gcc) + SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wmissing-declarations \ + -Wshadow +else ifeq ($(COMP),clang) + SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wconditional-uninitialized \ + -Wmissing-prototypes -Wshadow +else ifeq ($(COMP),icx) + SF_CXXFLAGS += -Wabi -Wmissing-declarations -Wmissing-prototypes -Wshadow endif -ifeq ($(sse),yes) - prefetch = yes -endif +### 2.3. Optimization options -# 64-bit pext is not available on x86-32 -ifeq ($(bits),32) - pext = no -endif +ifeq ($(optimize),yes) + SF_CXXFLAGS += -O3 + + ifeq ($(COMP),gcc) + SF_CXXFLAGS += -funroll-loops + ifeq ($(shell expr $(GCC_VERSION) \< 12),1) + SF_CXXFLAGS += -flto + SF_LDFLAGS += -flto + else + SF_CXXFLAGS += -flto=jobserver + SF_LDFLAGS += -flto=jobserver + endif + SF_CXXFLAGS += -flto-partition=one + SF_LDFLAGS += -flto-partition=one + else ifeq ($(COMP),clang) + SF_CXXFLAGS += -funroll-loops -flto=full + SF_LDFLAGS += -flto=full + ifeq ($(shell expr $(CLANG_VERSION) \< 16),1) + SF_CXXFLAGS += -fexperimental-new-pass-manager + endif + else ifeq ($(COMP),icx) + SF_CXXFLAGS += -flto=full + SF_LDFLAGS += -flto=full + endif +endif + +### 2.4. 
Debug options +ifeq ($(debug),no) + SF_CXXFLAGS += -DNDEBUG else - -# all other architectures - -ifeq ($(ARCH),general-32) - arch = any - bits = 32 -endif - -ifeq ($(ARCH),general-64) - arch = any -endif - -ifeq ($(ARCH),armv7) - arch = armv7 - prefetch = yes - bits = 32 - arm_version = 7 -endif - -ifeq ($(ARCH),armv7-neon) - arch = armv7 - prefetch = yes - popcnt = yes - neon = yes - bits = 32 - arm_version = 7 -endif - -ifeq ($(ARCH),armv8) - arch = armv8 - prefetch = yes - popcnt = yes - neon = yes - arm_version = 8 -endif - -ifeq ($(ARCH),armv8-dotprod) - arch = armv8 - prefetch = yes - popcnt = yes - neon = yes - dotprod = yes - arm_version = 8 -endif - -ifeq ($(ARCH),apple-silicon) - arch = arm64 - prefetch = yes - popcnt = yes - neon = yes - dotprod = yes - arm_version = 8 -endif - -ifeq ($(ARCH),ppc-32) - arch = ppc - bits = 32 -endif - -ifeq ($(ARCH),ppc-64) - arch = ppc64 - popcnt = yes - prefetch = yes -endif - -ifeq ($(findstring e2k,$(ARCH)),e2k) - arch = e2k - mmx = yes - bits = 64 - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - popcnt = yes -endif - -ifeq ($(ARCH),riscv64) - arch = riscv64 -endif - -ifeq ($(ARCH),loongarch64) - arch = loongarch64 + SF_CXXFLAGS += -g endif -endif - -### ========================================================================== -### Section 3. Low-level Configuration -### ========================================================================== +### 2.5. Sanitizer options -### 3.1 Selecting compiler (default = gcc) -ifeq ($(MAKELEVEL),0) - export ENV_CXXFLAGS := $(CXXFLAGS) - export ENV_DEPENDFLAGS := $(DEPENDFLAGS) - export ENV_LDFLAGS := $(LDFLAGS) +ifneq ($(sanitize),none) + SF_CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) endif -CXXFLAGS = $(ENV_CXXFLAGS) -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) -DEPENDFLAGS = $(ENV_DEPENDFLAGS) -std=c++17 -LDFLAGS = $(ENV_LDFLAGS) $(EXTRALDFLAGS) +### 2.6. 
Include Git commit hash and date -ifeq ($(COMP),) - COMP=gcc +GIT_SHA := $(shell git rev-parse --short=8 HEAD 2> /dev/null) +ifneq ($(GIT_SHA),) + SF_CXXFLAGS += -DGIT_SHA=$(GIT_SHA) endif -ifeq ($(COMP),gcc) - comp=gcc - CXX=g++ - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations - - ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) - ifeq ($(OS),Android) - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - ifeq ($(ARCH),riscv64) - CXXFLAGS += -latomic - endif - else ifeq ($(ARCH),loongarch64) - CXXFLAGS += -latomic - else - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - - ifeq ($(arch),$(filter $(arch),armv7)) - LDFLAGS += -latomic - endif - - ifneq ($(KERNEL),Darwin) - LDFLAGS += -Wl,--no-as-needed - endif +GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2> /dev/null) +ifneq ($(GIT_DATE),) + SF_CXXFLAGS += -DGIT_DATE=$(GIT_DATE) endif -ifeq ($(target_windows),yes) - LDFLAGS += -static -endif +### 2.7. Add flags based on target OS -ifeq ($(COMP),mingw) - comp=mingw - - ifeq ($(bits),64) - ifeq ($(shell which x86_64-w64-mingw32-c++-posix 2> /dev/null),) - CXX=x86_64-w64-mingw32-c++ - else - CXX=x86_64-w64-mingw32-c++-posix - endif - else - ifeq ($(shell which i686-w64-mingw32-c++-posix 2> /dev/null),) - CXX=i686-w64-mingw32-c++ - else - CXX=i686-w64-mingw32-c++-posix - endif - endif - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations +ifeq ($(MINGW),1) + SF_LDFLAGS += -static endif -ifeq ($(COMP),icx) - comp=icx - CXX=icpx - CXXFLAGS += --intel -pedantic -Wextra -Wshadow -Wmissing-prototypes \ - -Wconditional-uninitialized -Wabi -Wdeprecated -endif - -ifeq ($(COMP),clang) - comp=clang - CXX=clang++ - ifeq ($(target_windows),yes) - CXX=x86_64-w64-mingw32-clang++ - endif - - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-prototypes \ - -Wconditional-uninitialized - - ifeq ($(filter $(KERNEL),Darwin OpenBSD FreeBSD),) - ifeq ($(target_windows),) - ifneq ($(RTLIB),compiler-rt) - LDFLAGS += -latomic - 
endif - endif - endif - - ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) - ifeq ($(OS),Android) - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - ifeq ($(ARCH),riscv64) - CXXFLAGS += -latomic - endif - else ifeq ($(ARCH),loongarch64) - CXXFLAGS += -latomic - else - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif -endif +endif # CXX_REQUIRED_RULES -ifeq ($(KERNEL),Darwin) - CXXFLAGS += -mmacosx-version-min=10.15 - LDFLAGS += -mmacosx-version-min=10.15 - ifneq ($(arch),any) - CXXFLAGS += -arch $(arch) - LDFLAGS += -arch $(arch) - endif - XCRUN = xcrun -endif +### 3. Add flags from architecture-specific Makefile +### Note that this section is not enclosed in the CXX_REQUIRED_RULES block; +### Users shall be able to see the help text even when there is no compiler. -# To cross-compile for Android, NDK version r21 or later is recommended. -# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils. -# Currently we don't know how to make PGO builds with the NDK yet. 
-ifeq ($(COMP),ndk) - CXXFLAGS += -stdlib=libc++ -fPIE - comp=clang - ifeq ($(arch),armv7) - CXX=armv7a-linux-androideabi16-clang++ - CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon - ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),) - STRIP=arm-linux-androideabi-strip - else - STRIP=llvm-strip - endif - endif - ifeq ($(arch),armv8) - CXX=aarch64-linux-android21-clang++ - ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),) - STRIP=aarch64-linux-android-strip - else - STRIP=llvm-strip - endif - endif - ifeq ($(arch),x86_64) - CXX=x86_64-linux-android21-clang++ - ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),) - STRIP=x86_64-linux-android-strip - else - STRIP=llvm-strip - endif - endif - LDFLAGS += -static-libstdc++ -pie -lm -latomic +ifeq ($(ARCH),) + override ARCH := native endif -ifeq ($(comp),icx) - profile_make = icx-profile-make - profile_use = icx-profile-use -else ifeq ($(comp),clang) - profile_make = clang-profile-make - profile_use = clang-profile-use +ifeq ($(ARCH),native) + ARCH_NATIVE := y + SF_CXXFLAGS += -march=native -DARCH_NATIVE else - profile_make = gcc-profile-make - profile_use = gcc-profile-use - ifeq ($(KERNEL),Darwin) - EXTRAPROFILEFLAGS = -fvisibility=hidden - endif -endif + ifneq ($(filter x86%,$(ARCH)),) + ARCH_FAMILY := i386 + else ifneq ($(filter arm%,$(ARCH)),) + ARCH_FAMILY := arm + else + ARCH_FAMILY := generic + endif -### Allow overwriting CXX from command line -ifdef COMPCXX - CXX=$(COMPCXX) + include ./arch/$(ARCH_FAMILY)/Makefile endif -### Sometimes gcc is really clang -ifeq ($(COMP),gcc) - gccversion := $(shell $(CXX) --version 2>/dev/null) - gccisclang := $(findstring clang,$(gccversion)) - ifneq ($(gccisclang),) - profile_make = clang-profile-make - profile_use = clang-profile-use - endif -endif +export ARCH -### On mingw use Windows threads, otherwise POSIX -ifneq ($(comp),mingw) - CXXFLAGS += -DUSE_PTHREADS - # On Android Bionic's C library comes with its own pthread 
implementation bundled in - ifneq ($(OS),Android) - # Haiku has pthreads in its libroot, so only link it in on other platforms - ifneq ($(KERNEL),Haiku) - ifneq ($(COMP),ndk) - LDFLAGS += -lpthread - endif - endif - endif -endif +SF_CXXFLAGS += -DARCH=$(ARCH) -### 3.2.1 Debugging -ifeq ($(debug),no) - CXXFLAGS += -DNDEBUG -else - CXXFLAGS += -g -endif - -### 3.2.2 Debugging with undefined behavior sanitizers -ifneq ($(sanitize),none) - CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) - LDFLAGS += $(addprefix -fsanitize=,$(sanitize)) -endif +### 4. Extra flags for cross-compilation +### Information of target architecture is needed here. -### 3.3 Optimization -ifeq ($(optimize),yes) +ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) - CXXFLAGS += -O3 -funroll-loops - - ifeq ($(comp),gcc) - ifeq ($(OS), Android) - CXXFLAGS += -fno-gcse -mthumb -march=armv7-a -mfloat-abi=softfp - endif - endif - - ifeq ($(KERNEL),Darwin) - ifeq ($(comp),$(filter $(comp),clang icx)) - CXXFLAGS += -mdynamic-no-pic - endif - - ifeq ($(comp),gcc) - ifneq ($(arch),arm64) - CXXFLAGS += -mdynamic-no-pic - endif - endif - endif - - ifeq ($(comp),clang) - clangmajorversion := $(shell $(CXX) -dumpversion 2>/dev/null | cut -f1 -d.) 
- ifeq ($(shell expr $(clangmajorversion) \< 16),1) - CXXFLAGS += -fexperimental-new-pass-manager - endif - endif +# Android NDK +ifneq ($(filter $(ARCH_FAMILY),i386 arm),) + ifeq ($(call test-compiler-macro,__ANDROID__),1) + SF_CXXFLAGS += -stdlib=libc++ -fPIE + SF_LDFLAGS += -static-libstdc++ -pie + SF_LIBS += m atomic + endif endif -### 3.4 Bits -ifeq ($(bits),64) - CXXFLAGS += -DIS_64BIT +# Link atomic library if not i386/arm family +ifneq ($(ARCH_NATIVE),y) + ifeq ($(filter $(ARCH_FAMILY),i386 arm),) + SF_LIBS += atomic + endif endif -### 3.5 prefetch and popcount -ifeq ($(prefetch),yes) - ifeq ($(sse),yes) - CXXFLAGS += -msse - endif -else - CXXFLAGS += -DNO_PREFETCH -endif +endif # CXX_REQUIRED_RULES +endif # MAKELEVEL=0 -ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64)) - CXXFLAGS += -DUSE_POPCNT - else - CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT - endif -endif +SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS)) +SF_LDFLAGS := $(strip $(SF_LDFLAGS) $(LDFLAGS)) +SF_LIBS := $(strip $(SF_LIBS) $(LIBS)) -### 3.6 SIMD architectures -ifeq ($(avx2),yes) - CXXFLAGS += -DUSE_AVX2 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx2 -mbmi - endif -endif +export SF_CXXFLAGS SF_LDFLAGS SF_LIBS -ifeq ($(avxvnni),yes) - CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavxvnni - endif -endif +### ========================================================================== -ifeq ($(avx512),yes) - CXXFLAGS += -DUSE_AVX512 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw - endif -endif +define HELP_STRING +To see architecture-specific build options, run 'make help ARCH='. 
+Currently supported values: x86, arm, generic -ifeq ($(vnni256),yes) - CXXFLAGS += -DUSE_VNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 - endif -endif +How-to-build examples: -ifeq ($(vnni512),yes) - CXXFLAGS += -DUSE_VNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=512 - endif -endif + make profile-build -ifeq ($(sse41),yes) - CXXFLAGS += -DUSE_SSE41 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -msse4.1 - endif -endif +Build Stockfish with profile-guided optimization (PGO) for the current +architecture. -ifeq ($(ssse3),yes) - CXXFLAGS += -DUSE_SSSE3 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mssse3 - endif -endif + make build ARCH=x86-64-avx2 CXX=clang++-18 -ifeq ($(sse2),yes) - CXXFLAGS += -DUSE_SSE2 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -msse2 - endif -endif +Build Stockfish for the x86-64 architecture with AVX2/BMI2 support using clang++-18. -ifeq ($(mmx),yes) - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mmmx - endif -endif +Check the Stockfish wiki for advanced build configuration. -ifeq ($(neon),yes) - CXXFLAGS += -DUSE_NEON=$(arm_version) - ifeq ($(KERNEL),Linux) - ifneq ($(COMP),ndk) - ifneq ($(arch),armv8) - CXXFLAGS += -mfpu=neon - endif - endif - endif -endif +endef +export HELP_STRING -ifeq ($(dotprod),yes) - CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD -endif +# Print how-to-build help text if architecture is not set, otherwise +# list all available build presets for the selected architecture. 
+ifneq ($(ARCH_NATIVE),y) -### 3.7 pext -ifeq ($(pext),yes) - CXXFLAGS += -DUSE_PEXT - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mbmi2 - endif -endif +help: help-arch -### 3.8.1 Try to include git commit sha for versioning -GIT_SHA := $(shell git rev-parse HEAD 2>/dev/null | cut -c 1-8) -ifneq ($(GIT_SHA), ) - CXXFLAGS += -DGIT_SHA=$(GIT_SHA) -endif +config-sanity: config-sanity-arch -### 3.8.2 Try to include git commit date for versioning -GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2>/dev/null) -ifneq ($(GIT_DATE), ) - CXXFLAGS += -DGIT_DATE=$(GIT_DATE) -endif +else -### 3.8.3 Try to include architecture -ifneq ($(ARCH), ) - CXXFLAGS += -DARCH=$(ARCH) -endif +help: + @echo "$${HELP_STRING}" -### 3.9 Link Time Optimization -### This is a mix of compile and link time options because the lto link phase -### needs access to the optimization flags. -ifeq ($(optimize),yes) -ifeq ($(debug), no) - ifeq ($(comp),$(filter $(comp),clang icx)) - CXXFLAGS += -flto=full - ifeq ($(comp),icx) - CXXFLAGS += -fwhole-program-vtables - endif - ifeq ($(target_windows),yes) - CXXFLAGS += -fuse-ld=lld - endif - LDFLAGS += $(CXXFLAGS) - -# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be -# GCC on some systems. - else ifeq ($(comp),gcc) - ifeq ($(gccisclang),) - CXXFLAGS += -flto -flto-partition=one - LDFLAGS += $(CXXFLAGS) -flto=jobserver - else - CXXFLAGS += -flto=full - LDFLAGS += $(CXXFLAGS) - endif - -# To use LTO and static linking on Windows, -# the tool chain requires gcc version 10.1 or later. - else ifeq ($(comp),mingw) - CXXFLAGS += -flto -flto-partition=one - LDFLAGS += $(CXXFLAGS) -save-temps - endif -endif endif -### 3.10 Android 5 can only run position independent executables. Note that this -### breaks Android 4.0 and earlier. 
-ifeq ($(OS), Android) - CXXFLAGS += -fPIE - LDFLAGS += -fPIE -pie -endif +define CONFIG_SANITY_STRING -### ========================================================================== -### Section 4. Public Targets -### ========================================================================== +Build options: + optimize: $(optimize) + debug: $(debug) + sanitize: $(sanitize) -help: - @echo "" - @echo "To compile stockfish, type: " - @echo "" - @echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" - @echo "" - @echo "Supported targets:" - @echo "" - @echo "help > Display architecture details" - @echo "profile-build > standard build with profile-guided optimization" - @echo "build > skip profile-guided optimization" - @echo "net > Download the default nnue nets" - @echo "strip > Strip executable" - @echo "install > Install executable" - @echo "clean > Clean up" - @echo "" - @echo "Supported archs:" - @echo "" - @echo "native > select the best architecture for the host processor (default)" - @echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support" - @echo "x86-64-vnni256 > x86 64-bit with vnni 512bit support, limit operands to 256bit wide" - @echo "x86-64-avx512 > x86 64-bit with avx512 support" - @echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support" - @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" - @echo "x86-64-avx2 > x86 64-bit with avx2 support" - @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" - @echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt" - @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" - @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support" - @echo "x86-64 > x86 64-bit generic (with sse2 support)" - @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" - @echo "x86-32-sse2 > x86 32-bit with sse2 support" - @echo "x86-32 > x86 32-bit generic (with mmx compile support)" - @echo "ppc-64 > PPC 64-bit" - @echo "ppc-32 > PPC 32-bit" - @echo "armv7 > 
ARMv7 32-bit" - @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" - @echo "armv8 > ARMv8 64-bit with popcnt and neon" - @echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support" - @echo "e2k > Elbrus 2000" - @echo "apple-silicon > Apple silicon ARM64" - @echo "general-64 > unspecified 64-bit" - @echo "general-32 > unspecified 32-bit" - @echo "riscv64 > RISC-V 64-bit" - @echo "loongarch64 > LoongArch 64-bit" - @echo "" - @echo "Supported compilers:" - @echo "" - @echo "gcc > GNU compiler (default)" - @echo "mingw > GNU compiler with MinGW under Windows" - @echo "clang > LLVM Clang compiler" - @echo "icx > Intel oneAPI DPC++/C++ Compiler" - @echo "ndk > Google NDK to cross-compile for Android" - @echo "" - @echo "Simple examples. If you don't know what to do, you likely want to run one of: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems " - @echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems " - @echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems " - @echo "" - @echo "Advanced examples, for experienced users: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avxvnni" - @echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" - @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" - @echo "" -ifneq ($(SUPPORTED_ARCH), true) - @echo "Specify a supported architecture with the ARCH option for more details" - @echo "" -endif +Compiler options: + CXX: $(CXX) + CXXFLAGS: $(SF_CXXFLAGS) + LDFLAGS: $(SF_LDFLAGS) $(SF_LIBS:%=-l%) +endef +export CONFIG_SANITY_STRING -.PHONY: help analyze build profile-build strip install clean net \ - objclean profileclean config-sanity \ - icx-profile-use icx-profile-make \ - gcc-profile-use gcc-profile-make \ - clang-profile-use clang-profile-make FORCE \ - format analyze +config-sanity: net + @[ "$(optimize)" = "yes" -o "$(optimize)" = "no" ] + @[ "$(debug)" = 
"yes" -o "$(debug)" = "no" ] + @[ ! -z "$(sanitize)" ] + @echo "$${CONFIG_SANITY_STRING}" -analyze: net config-sanity objclean - $(MAKE) -k ARCH=$(ARCH) COMP=$(COMP) $(OBJS) +analyze: config-sanity objclean + @$(MAKE) -k --no-print-directory CXXFLAGS="" LDFLAGS="" $(OBJS) -build: net config-sanity - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all +build: config-sanity + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" all -profile-build: net config-sanity objclean profileclean - @echo "" +profile-build: config-sanity objclean profileclean @echo "Step 1/4. Building instrumented executable ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) - @echo "" - @echo "Step 2/4. Running benchmark for pgo-build ..." - $(PGOBENCH) > PGOBENCH.out 2>&1 - tail -n 4 PGOBENCH.out - @echo "" - @echo "Step 3/4. Building optimized executable ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) - @echo "" - @echo "Step 4/4. Deleting profile data ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean - -strip: - $(STRIP) $(EXE) - -install: - -mkdir -p -m 755 $(BINDIR) - -cp $(EXE) $(BINDIR) - $(STRIP) $(BINDIR)/$(EXE) - -# clean all -clean: objclean profileclean - @rm -f .depend *~ core - -# clean binaries and objects -objclean: - @rm -f stockfish stockfish.exe *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o - -# clean auxiliary profiling files -profileclean: - @rm -rf profdir - @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out - @rm -f stockfish.profdata *.profraw - @rm -f stockfish.*args* - @rm -f stockfish.*lt* - @rm -f stockfish.res - @rm -f ./-lstdc++.res - -# evaluation network (nnue) -net: - @$(SHELL) ../scripts/net.sh - -format: - $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file - -# default target -default: - help - -### ========================================================================== -### Section 5. 
Private Targets -### ========================================================================== + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make) + @printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..." + @$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1 + @tail -n 4 PGOBENCH.out + @printf "\n%s\n" "Step 3/4. Building optimized executable ..." + @$(MAKE) --no-print-directory objclean + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_use) + @printf "\n%s\n" "Step 4/4. Deleting profile data ..." + @$(MAKE) --no-print-directory profileclean all: $(EXE) .depend -config-sanity: net - @echo "" - @echo "Config:" - @echo "debug: '$(debug)'" - @echo "sanitize: '$(sanitize)'" - @echo "optimize: '$(optimize)'" - @echo "arch: '$(arch)'" - @echo "bits: '$(bits)'" - @echo "kernel: '$(KERNEL)'" - @echo "os: '$(OS)'" - @echo "prefetch: '$(prefetch)'" - @echo "popcnt: '$(popcnt)'" - @echo "pext: '$(pext)'" - @echo "sse: '$(sse)'" - @echo "mmx: '$(mmx)'" - @echo "sse2: '$(sse2)'" - @echo "ssse3: '$(ssse3)'" - @echo "sse41: '$(sse41)'" - @echo "avx2: '$(avx2)'" - @echo "avxvnni: '$(avxvnni)'" - @echo "avx512: '$(avx512)'" - @echo "vnni256: '$(vnni256)'" - @echo "vnni512: '$(vnni512)'" - @echo "neon: '$(neon)'" - @echo "dotprod: '$(dotprod)'" - @echo "arm_version: '$(arm_version)'" - @echo "target_windows: '$(target_windows)'" - @echo "" - @echo "Flags:" - @echo "CXX: $(CXX)" - @echo "CXXFLAGS: $(CXXFLAGS)" - @echo "LDFLAGS: $(LDFLAGS)" - @echo "" - @echo "Testing config sanity. If this fails, try 'make help' ..." 
- @echo "" - @test "$(debug)" = "yes" || test "$(debug)" = "no" - @test "$(optimize)" = "yes" || test "$(optimize)" = "no" - @test "$(SUPPORTED_ARCH)" = "true" - @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ - test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64" - @test "$(bits)" = "32" || test "$(bits)" = "64" - @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" - @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" - @test "$(pext)" = "yes" || test "$(pext)" = "no" - @test "$(sse)" = "yes" || test "$(sse)" = "no" - @test "$(mmx)" = "yes" || test "$(mmx)" = "no" - @test "$(sse2)" = "yes" || test "$(sse2)" = "no" - @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" - @test "$(sse41)" = "yes" || test "$(sse41)" = "no" - @test "$(avx2)" = "yes" || test "$(avx2)" = "no" - @test "$(avx512)" = "yes" || test "$(avx512)" = "no" - @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" - @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" - @test "$(neon)" = "yes" || test "$(neon)" = "no" - @test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ - || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" - $(EXE): $(OBJS) - +$(CXX) -o $@ $(OBJS) $(LDFLAGS) + +$(CXX) $(SF_LDFLAGS) -o $@ $(OBJS) $(SF_LIBS:%=-l%) + +%.o: %.cpp + +$(CXX) $(SF_CXXFLAGS) -c -o $@ $< # Force recompilation to ensure version info is up-to-date misc.o: FORCE FORCE: -clang-profile-make: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-generate ' \ - EXTRALDFLAGS=' -fprofile-generate' \ - all - -clang-profile-use: - $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' 
\ - EXTRALDFLAGS='-fprofile-use ' \ - all +.depend: $(SRCS) + -@$(CXX) $(SF_CXXFLAGS) -MM $(SRCS) > $@ && \ + printf "%s\n\n" "Dependency updated, restarting Make..." gcc-profile-make: @mkdir -p profdir - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-generate=profdir' \ - EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ - EXTRALDFLAGS='-lgcov' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-generate=profdir" LDFLAGS="" LIBS="gcov" all gcc-profile-use: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \ - EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ - EXTRALDFLAGS='-lgcov' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-use=profdir -fno-peel-loops -fno-tracer" LDFLAGS="" LIBS="gcov" all + +clang-profile-make: + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all + +clang-profile-use: + $(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-use=stockfish.profdata" \ + LDFLAGS="-fprofile-use=stockfish.profdata" \ + all icx-profile-make: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-generate ' \ - EXTRALDFLAGS=' -fprofile-instr-generate' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-instr-generate" LDFLAGS="-fprofile-instr-generate" all icx-profile-use: - $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ - EXTRALDFLAGS='-fprofile-use ' \ - all + @$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-instr-use=stockfish.profdata" LDFLAGS="-fprofile-use" all -.depend: $(SRCS) - -@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null - -ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean config-sanity)) +ifneq ($(filter 
$(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) -include .depend -endif +endif diff --git a/src/arch/.clang-format b/src/arch/.clang-format new file mode 100644 index 00000000000..50b996b1a8a --- /dev/null +++ b/src/arch/.clang-format @@ -0,0 +1,5 @@ +BasedOnStyle: InheritParentConfig + +# Architecture specific files use a lot of preprocessor directives. +# Do not indent them for better readability. +IndentPPDirectives: None diff --git a/src/arch/arm/Makefile b/src/arch/arm/Makefile new file mode 100644 index 00000000000..6f33d3e1576 --- /dev/null +++ b/src/arch/arm/Makefile @@ -0,0 +1,43 @@ +define HELP_STRING_ARM +To build Stockfish, run the following command: + +make [ARCH=] [CPU=] + +Build presets for ARM/AArch64 architecture: + +armv8-dotprod > ARMv8.2-A 64-bit, Neon with DotProd feature +armv8 > ARMv8-A 64-bit, Neon (Advanced SIMD) +armv7-neon > ARMv7-A 32-bit, Neon (Advanced SIMD) +armv7 > ARMv7-A 32-bit, no SIMD support + +Stockfish does not support AArch32 targets. +Stockfish does not support non-A profile architectures. 
+ +endef +export HELP_STRING_ARM + +ifneq ($(filter $(COMP),gcc clang icx),) + +ifeq ($(ARCH),armv8-dotprod) + SF_CXXFLAGS += -march=armv8.2-a+dotprod +else ifeq ($(ARCH),armv8) + SF_CXXFLAGS += -march=armv8-a +else ifeq ($(ARCH),armv7-neon) + SF_CXXFLAGS += -march=armv7-a -mfpu=neon -mfloat-abi=softfp +else ifeq ($(ARCH),armv7) + SF_CXXFLAGS += -march=armv7-a +endif + +endif # gcc clang icx + +.PHONY: help-arch config-sanity-arch + +help-arch: + @echo "$${HELP_STRING_ARM}" + +config-sanity-arch: + @[ "$(ARCH)" = "armv8-dotprod" -o \ + "$(ARCH)" = "armv8" -o \ + "$(ARCH)" = "armv7-neon" -o \ + "$(ARCH)" = "armv7" \ + ] diff --git a/src/arch/arm/arch.h b/src/arch/arm/arch.h new file mode 100644 index 00000000000..30f50451aaf --- /dev/null +++ b/src/arch/arm/arch.h @@ -0,0 +1,119 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_ARCH_H_INCLUDED +#define ARM_ARCH_H_INCLUDED + +#if !defined(__arm__) && !defined(__aarch64__) && __ARM_ARCH_PROFILE != 'A' +#error "Not supported in the current architecture." +#endif + +#if __ARM_ARCH >= 8 && (!defined(__ARM_64BIT_STATE) || !defined(__ARM_NEON)) +#error "Invalid AArch64 state." 
+#endif + +#include +#include +#include + +#include "common.h" + +#include + +#ifdef __ARM_NEON + +#include + +namespace Stockfish { + +template +inline int __neon_cnt(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + + uint8x8_t cnt = vcnt_u8(vcreate_u8(std::uint64_t(n))); + +#if __ARM_ARCH >= 8 + return vaddv_u8(cnt); +#else + return vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(cnt))), 0); +#endif +} + +inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) { +#ifdef __ARM_FEATURE_DOTPROD + acc = vdotq_s32(acc, in, col); +#elif __ARM_ARCH >= 8 + int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col)); + int16x8_t product1 = vmull_high_s8(in, col); + int16x8_t sum = vpaddq_s16(product0, product1); + acc = vpadalq_s16(acc, sum); +#else + int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col)); + int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col)); + int16x8_t sum = + vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1))); + acc = vpadalq_s16(acc, sum); +#endif +} + +} // namespace Stockfish + +#endif // __ARM_NEON + +namespace Stockfish { + +inline constexpr bool ArchImpl::Is64Bit = __ARM_ARCH >= 8; +inline constexpr bool ArchImpl::UsePEXT = false; + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + +#ifdef __ARM_NEON + return __neon_cnt(n); +#else + return __popcount_value(n); +#endif +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { + return 0; +} + +// =========================================================================== +// The functions below are used on ARM/AArch64 targets only. 
+// =========================================================================== + +template +inline int ctz(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + assert(n != 0); + + if constexpr (sizeof(T) == 8) + return __clzll(__rbitll(std::uint64_t(n))); + else + return __clz(__rbit(std::uint32_t(n))); +} + +} // namespace Stockfish + +#endif // ARM_ARCH_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/affine_transform.h b/src/arch/arm/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..3fb69bce2d8 --- /dev/null +++ b/src/arch/arm/nnue/layers/affine_transform.h @@ -0,0 +1,106 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. +// If none is defined, fall back to the generic implementation. 
+#ifndef __ARM_NEON + +#include "arch/generic/nnue/layers/affine_transform.h" + +#else + +#include "../../arch.h" + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + i % 4; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + if constexpr (OutputDimensions > 1) + { + static constexpr IndexType OutputLanes = 16 / sizeof(OutputType); + static_assert(OutputDimensions % OutputLanes == 0); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + int32x4_t acc[NumRegs]; + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType i = 0; i < NumChunks; ++i) + { + const int8x16_t in = + vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i])); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + vdotq_s32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; + } + else if constexpr (OutputDimensions == 1) + { + static constexpr IndexType InputLanes = 16 / sizeof(InputType); + static_assert(PaddedInputDimensions % InputLanes == 0); + + static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes; + + int32x4_t sum = vdupq_n_s32(0); + + for (IndexType j = 0; j < NumChunks; ++j) + { + const int8x16_t in = reinterpret_cast(input)[j]; + const int8x16_t row = reinterpret_cast(weights)[j]; + vdotq_s32_v(sum, in, row); + } + +#if __ARM_ARCH >= 8 + output[0] = vaddvq_s32(sum) + biases[0]; +#else + output[0] = vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + + 
vgetq_lane_s32(sum, 3) + biases[0]; +#endif + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h new file mode 100644 index 00000000000..546002c256f --- /dev/null +++ b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h @@ -0,0 +1,129 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED +#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include "../../arch.h" + +#include +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +#if __ARM_ARCH >= 8 + +alignas(CacheLineSize) static const std::array, 256> lookupIndices = + [] { + std::array, 256> array{}; + for (std::uint64_t i = 0; i < 256; ++i) + { + std::uint64_t j = i, k = 0; + while (j) + array[i][k++] = ctz(j), j &= j - 1; + } + return array; + }(); + +template +class AffineTransformSparseInput: public AffineTransform { + __DEFINE_BASE_PROPERTIES + + static_assert(OutputDimensions % 16 == 0, + "OutputDimensions must be multiple of 16 for this layer."); + + public: + void propagate(const InputType* input, OutputType* output) const; + + private: + template + static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) { + IndexType count = 0; + uint16x8_t base = vdupq_n_u16(0); + + const auto in = reinterpret_cast(input); + + for (IndexType i = 0; i < InputDimensions / 8; ++i) + { + const int32x4_t chunk0 = in[i * 2]; + const int32x4_t chunk1 = in[i * 2 + 1]; + + static const uint32x4_t movemask = [] { + const std::uint32_t n[4] = {1, 2, 4, 8}; + return vld1q_u32(n); + }(); + + const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask)) + | vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask)) + << 4; + const uint16x8_t offsets = *reinterpret_cast(&lookupIndices[nnz]); + *reinterpret_cast(indices + count) = vaddq_u16(base, offsets); + count += popcount(nnz); + base = vaddq_u16(base, vdupq_n_u16(8)); + } + + return count; + } +}; + +template +void AffineTransformSparseInput::propagate(const InputType* input, + OutputType* output) const { + static constexpr IndexType OutputLanes = 16 / sizeof(OutputType); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + int32x4_t acc[NumRegs]; + std::uint16_t nnz[NumChunks]; + IndexType count = 
populate_nz_indices(input, nnz); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType j = 0; j < count; ++j) + { + const auto i = nnz[j]; + const int8x16_t in = + vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i])); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + for (std::size_t k = 0; k < array_size(acc); ++k) + vdotq_s32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; +} + +#else + +template +using AffineTransformSparseInput = AffineTransform; + +#endif // __ARM_ARCH >= 8 + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/clipped_relu.h b/src/arch/arm/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..7e644ec5060 --- /dev/null +++ b/src/arch/arm/nnue/layers/clipped_relu.h @@ -0,0 +1,72 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. 
+// If none is defined, fall back to the generic implementation. +#ifndef __ARM_NEON + +#include "arch/generic/nnue/layers/clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 8; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { +#if __ARM_ARCH >= 8 + int16x4_t words0 = vqshrn_n_s32(in[i * 2], WeightScaleBits); + int16x8_t words = vqshrn_high_n_s32(words0, in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0)); +#else + union { + int16x4x2_t tuple; + int16x8_t all; + } words; + + words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); + words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0)); +#endif + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/sqr_clipped_relu.h b/src/arch/arm/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..f67388c6fac --- /dev/null +++ b/src/arch/arm/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,29 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// lazy +#include "arch/generic/nnue/layers/sqr_clipped_relu.h" + +#endif // ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/arm/nnue/nnue_feature_transformer.h b/src/arch/arm/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..fe0b077f008 --- /dev/null +++ b/src/arch/arm/nnue/nnue_feature_transformer.h @@ -0,0 +1,343 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. 
+// If none is defined, fall back to the generic implementation. +#ifndef __ARM_NEON + +#include "arch/generic/nnue/nnue_feature_transformer.h" + +#else + +#include "../arch.h" + +#include +#include + +#include "misc.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +struct FeatureTransformer::Details { + private: + static constexpr int NumQReg = 16; + + public: + static constexpr int OptimalAccRegisterCount = + optimal_register_count<16, NumQReg, sizeof(WeightType), TransformedFeatureDimensions>(); + static constexpr int OptimalPSQTRegisterCount = + optimal_register_count<16, NumQReg, sizeof(PSQTWeightType), PSQTBuckets>(); + + static constexpr IndexType TileHeight = OptimalAccRegisterCount * 16 / sizeof(WeightType); + static constexpr IndexType PsqtTileHeight = + OptimalPSQTRegisterCount * 16 / sizeof(PSQTWeightType); + + static_assert(HalfDimensions % TileHeight == 0, + "HalfDimensions must be multiple of TileHeight"); + static_assert(PSQTBuckets % PsqtTileHeight == 0, + "PSQTBuckets must be multiple of PsqtTileHeight"); +}; + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() {} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + // The most common case when updating the accumulator incrementally. + // Calculates feature differences directly without using tiling mechanism. 
+ if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) + { + const auto accIn = + reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); + const auto accOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); + + const IndexType offsetR0 = HalfDimensions * removed[0]; + const auto columnR0 = reinterpret_cast(&weights[offsetR0]); + const IndexType offsetA = HalfDimensions * added[0]; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i) + accOut[i] = vaddq_s16(vsubq_s16(accIn[i], columnR0[i]), columnA[i]); + } + else + { + const IndexType offsetR1 = HalfDimensions * removed[1]; + const auto columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i) + accOut[i] = + vsubq_s16(vaddq_s16(accIn[i], columnA[i]), vaddq_s16(columnR0[i], columnR1[i])); + } + + const auto accPsqtIn = + reinterpret_cast(&(computed->*accPtr).psqtAccumulation[Perspective][0]); + const auto accPsqtOut = + reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); + + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; + auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); + const IndexType offsetPsqtA = PSQTBuckets * added[0]; + auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i) + accPsqtOut[i] = vaddq_s32(vsubq_s32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA[i]); + } + else + { + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; + const auto columnPsqtR1 = + reinterpret_cast(&psqtWeights[offsetPsqtR1]); + + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i) + accPsqtOut[i] = vsubq_s32(vaddq_s32(accPsqtIn[i], columnPsqtA[i]), + vaddq_s32(columnPsqtR0[i], columnPsqtR1[i])); + } + } + else + { + 
int16x8_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType i = 0; i < HalfDimensions / Details::TileHeight; ++i) + { + const IndexType offsetRow = i * Details::TileHeight; + + const auto accTileIn = reinterpret_cast( + &(computed->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = accTileIn[j]; + + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = vsubq_s16(acc[j], column[j]); + } + + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = vaddq_s16(acc[j], column[j]); + } + + const auto accTileOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + accTileOut[j] = acc[j]; + } + + int32x4_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType i = 0; i < PSQTBuckets / Details::PsqtTileHeight; ++i) + { + const IndexType offsetRow = i * Details::PsqtTileHeight; + + auto accTilePsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = accTilePsqtIn[j]; + + for (const auto index : removed) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = vsubq_s32(psqt[j], columnPsqt[j]); + } + + for (const auto index : added) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = vaddq_s32(psqt[j], columnPsqt[j]); + } + + auto 
accTilePsqtOut = reinterpret_cast( + &(next->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + accTilePsqtOut[j] = psqt[j]; + } + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + int16x8_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType j = 0; j < HalfDimensions / Details::TileHeight; ++j) + { + const IndexType offsetRow = j * Details::TileHeight; + + const auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][offsetRow]); + const auto entryTile = reinterpret_cast(&entry.accumulation[offsetRow]); + + for (IndexType k = 0; k < array_size(acc); ++k) + acc[k] = entryTile[k]; + + std::size_t i = 0; + for (; i < std::min(removed.size(), added.size()); ++i) + { + const IndexType offsetR = HalfDimensions * removed[i] + offsetRow; + const auto columnR = reinterpret_cast(&weights[offsetR]); + const IndexType offsetA = HalfDimensions * added[i] + offsetRow; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vaddq_s16(acc[k], vsubq_s16(columnA[k], columnR[k])); + } + for (; i < removed.size(); ++i) + { + const IndexType offset = HalfDimensions * removed[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vsubq_s16(acc[k], column[k]); + } + for (; i < added.size(); ++i) + { + const IndexType offset = HalfDimensions * added[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vaddq_s16(acc[k], column[k]); + } + + for (std::size_t k = 0; k < array_size(acc); k++) + entryTile[k] = acc[k]; + for (std::size_t k = 0; k < array_size(acc); 
k++) + accTile[k] = acc[k]; + } + + int32x4_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType j = 0; j < PSQTBuckets / Details::PsqtTileHeight; ++j) + { + const IndexType offsetRow = j * Details::PsqtTileHeight; + + const auto accTilePsqt = + reinterpret_cast(&accumulator.psqtAccumulation[Perspective][offsetRow]); + const auto entryTilePsqt = reinterpret_cast(&entry.psqtAccumulation[offsetRow]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = entryTilePsqt[k]; + + for (std::size_t i = 0; i < removed.size(); ++i) + { + const IndexType offset = PSQTBuckets * removed[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = vsubq_s32(psqt[k], columnPsqt[k]); + } + for (std::size_t i = 0; i < added.size(); ++i) + { + const IndexType offset = PSQTBuckets * added[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = vaddq_s32(psqt[k], columnPsqt[k]); + } + + for (std::size_t k = 0; k < array_size(psqt); ++k) + entryTilePsqt[k] = psqt[k]; + for (std::size_t k = 0; k < array_size(psqt); ++k) + accTilePsqt[k] = psqt[k]; + } +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + static constexpr IndexType OutputChunkSize = 16 / sizeof(OutputType); + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + + static constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; + + const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const auto in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const auto in1 = + reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + const auto out = 
reinterpret_cast(&output[(HalfDimensions / 2) * p]); + + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + static const int16x8_t Zeroes = vdupq_n_s16(0); + static const int16x8_t Ones = vdupq_n_s16(127 * 2); + + const int16x8_t sum0a = + vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 0], Ones), Zeroes), 6); + const int16x8_t sum0b = + vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 1], Ones), Zeroes), 6); + const int16x8_t sum1a = vminq_s16(in1[j * 2 + 0], Ones); + const int16x8_t sum1b = vminq_s16(in1[j * 2 + 1], Ones); + + const int16x8_t pa = vqdmulhq_s16(sum0a, sum1a); + const int16x8_t pb = vqdmulhq_s16(sum0b, sum1b); + + out[j] = vcombine_u8(vqmovun_s16(pa), vqmovun_s16(pb)); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/arch/generic/Makefile b/src/arch/generic/Makefile new file mode 100644 index 00000000000..230916e41b7 --- /dev/null +++ b/src/arch/generic/Makefile @@ -0,0 +1,32 @@ +define GENERIC_HELP_STRING +To build Stockfish, run the following command: + +make [ARCH=] [COMP=] + +Build presets for other architectures: + +general-64 > 64-bit generic +general-32 > 32-bit generic + +To see architecture-specific build options, run 'make help ARCH='. 
+Currently supported values: x86, arm, generic + +endef +export GENERIC_HELP_STRING + +ifeq ($(COMP),$(filter $(COMP),gcc clang icx)) + +ifeq ($(ARCH),general-64) + CXXFLAGS += -m64 +else ifeq ($(ARCH),general-32) + CXXFLAGS += -m32 +endif + +endif + +help-arch: + @echo "$${GENERIC_HELP_STRING}" + +config-sanity-arch: + @test "$(ARCH)" = "general-64" || \ + test "$(ARCH)" = "general-32" diff --git a/src/arch/generic/arch.h b/src/arch/generic/arch.h new file mode 100644 index 00000000000..3641ab6f3f2 --- /dev/null +++ b/src/arch/generic/arch.h @@ -0,0 +1,51 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_ARCH_H_INCLUDED +#define GENERIC_ARCH_H_INCLUDED + +#include +#include + +#include "common.h" + +namespace Stockfish { + +// There is no practical way to detect the register width, so we assume that +// it is always 64-bit if address size is 64-bit. 
+inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8; + +inline constexpr bool ArchImpl::UsePEXT = false; + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + return __popcount_value(n); +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { + return 0; +} + +} // namespace Stockfish + +#endif // GENERIC_ARCH_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/affine_transform.h b/src/arch/generic/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..341816d3801 --- /dev/null +++ b/src/arch/generic/nnue/layers/affine_transform.h @@ -0,0 +1,59 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return i; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + std::memcpy(output, biases, sizeof(OutputType) * OutputDimensions); + + // Traverse weights in transpose order to take advantage of input sparsity + for (IndexType i = 0; i < InputDimensions; ++i) + { + const InputType in = input[i]; + if (in) + { + const WeightType* w = &weights[i]; + for (IndexType j = 0; j < OutputDimensions; ++j) + output[j] += w[j * PaddedInputDimensions] * in; + } + } +} + +template +using AffineTransformSparseInput = AffineTransform; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/clipped_relu.h b/src/arch/generic/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..ba965b9b8e7 --- /dev/null +++ b/src/arch/generic/nnue/layers/clipped_relu.h @@ -0,0 +1,40 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + for (IndexType i = 0; i < InputDimensions; ++i) + output[i] = static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127)); +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/sqr_clipped_relu.h b/src/arch/generic/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..5048df5406a --- /dev/null +++ b/src/arch/generic/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,45 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const { + // The correct formula is to divide by 127, but we need to make it fast + // therefore right-shift by extra 7 bits is used instead. Needs to be + // accounted for in the trainer. + for (IndexType i = 0; i < InputDimensions; ++i) + output[i] = static_cast(std::min( + std::int64_t(127), std::int64_t(input[i]) * input[i] >> (2 * WeightScaleBits + 7))); +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/generic/nnue/nnue_feature_transformer.h b/src/arch/generic/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..f885980541a --- /dev/null +++ b/src/arch/generic/nnue/nnue_feature_transformer.h @@ -0,0 +1,146 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include +#include + +#include "position.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() {} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + + std::memcpy((next->*accPtr).accumulation[Perspective], + (computed->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType)); + std::memcpy((next->*accPtr).psqtAccumulation[Perspective], + (computed->*accPtr).psqtAccumulation[Perspective], + PSQTBuckets * sizeof(PSQTWeightType)); + + // Difference calculation for the deactivated features + for (const auto index : removed) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] -= weights[wOffset + i]; + + for (IndexType i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] -= psqtWeights[pwOffset + i]; + } + + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] += weights[wOffset + i]; + + for (IndexType i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] += psqtWeights[pwOffset + i]; + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + for (const auto index : removed) + { + const 
IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[j] -= weights[wOffset + j]; + + for (IndexType k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] -= psqtWeights[pwOffset + k]; + } + for (const auto index : added) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[j] += weights[wOffset + j]; + + for (IndexType k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] += psqtWeights[pwOffset + k]; + } + + // The accumulator of the refresh entry has been updated. + // Now copy its content to the actual accumulator we were refreshing. + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, + sizeof(BiasType) * HalfDimensions); + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, + sizeof(PSQTWeightType) * PSQTBuckets); +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + const int perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = (HalfDimensions / 2) * p; + + for (IndexType j = 0; j < HalfDimensions / 2; ++j) + { + BiasType sum0 = accumulation[perspectives[p]][j]; + BiasType sum1 = accumulation[perspectives[p]][j + HalfDimensions / 2]; + sum0 = std::clamp(sum0, 0, 127 * 2); + sum1 = std::clamp(sum1, 0, 127 * 2); + output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/arch/i386/Makefile b/src/arch/i386/Makefile new file mode 100644 index 00000000000..f9a97330a2a --- /dev/null +++ b/src/arch/i386/Makefile @@ -0,0 +1,116 @@ 
+define HELP_STRING_I386 +To build Stockfish, run the following command: + +make [ARCH=] [NO_AVX512=1] + +Build presets for x86 architecture: + +x86-64-avx512-vnni > 64-bit, AVX-512 with BW/VL/VNNI +x86-64-avx512 > 64-bit, AVX-512 with BW/VL +x86-64-avxvnni > 64-bit, AVX2 with AVX-VNNI +x86-64-bmi2 > 64-bit, AVX2 and BMI2 +x86-64-avx2 > 64-bit, AVX2 +x86-64-avx > 64-bit, AVX +x86-64-popcnt > 64-bit, POPCNT with SSE4.2 +x86-64-sse41 > 64-bit, SSE4.1 +x86-64-ssse3 > 64-bit, SSSE3 +x86-64 > 64-bit generic (supports up to SSE2) +x86-32 > 32-bit generic (supports up to SSE2) + +Build presets for x86 architecture, AMD processors: + +x86-64-bmi > 64-bit, AVX with BMI +x86-64-abm > 64-bit, SSE4a with ABM + +x86 options: + + NO_AVX512 + +If NO_AVX512 is set, Stockfish will use 256-bit SIMD instructions instead of +512-bit ones. This option is only available if AVX-512 is enabled. + +endef +export HELP_STRING_I386 + +# Compatibility layer for old Makefile presets +ifeq ($(ARCH),x86-64-vnni512) + override ARCH := x86-64-avx512-vnni +endif + +ifeq ($(ARCH),x86-64-vnni256) + override ARCH := x86-64-avx512-vnni + override NO_AVX512 := 1 +endif + +ifneq ($(filter $(COMP),gcc clang icx),) + +ifeq ($(ARCH),x86-64-avx512-vnni) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \ + -mavx512vl -mavx512vnni + ifeq ($(NO_AVX512),1) + SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512 + endif +else ifeq ($(ARCH),x86-64-avx512) + # Xeon Phi is no longer supported on future compiler versions. 
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \ + -mavx512vl + ifeq ($(NO_AVX512),1) + SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512 + endif +else ifeq ($(ARCH),x86-64-avxvnni) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavxvnni +else ifeq ($(ARCH),x86-64-bmi2) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 +else ifeq ($(ARCH),x86-64-avx2) + # VIA Eden X4 does not support BMI/BMI2, need a custom profile if required. + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 +else ifeq ($(ARCH),x86-64-bmi) + # Some AMD processors (AMD family 15h) have BMI1 only. + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi +else ifeq ($(ARCH),x86-64-avx) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx +else ifeq ($(ARCH),x86-64-popcnt) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt +else ifeq ($(ARCH),x86-64-sse41) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 +else ifeq ($(ARCH),x86-64-ssse3) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 +else ifeq ($(ARCH),x86-64-abm) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -msse4a -mabm +else ifeq ($(ARCH),x86-64) + # Modern compilers add SSE2 flag by default. 
+ SF_CXXFLAGS += -m64 +else ifeq ($(ARCH),x86-32) + SF_CXXFLAGS += -m32 +endif + +endif # gcc clang icx + +.PHONY: help-arch config-sanity-arch + +help-arch: + @echo "$${HELP_STRING_I386}" + +config-sanity-arch: + @[ "$(ARCH)" = "x86-64-avx512-vnni" -o \ + "$(ARCH)" = "x86-64-avx512" -o \ + "$(ARCH)" = "x86-64-avxvnni" -o \ + "$(ARCH)" = "x86-64-bmi2" -o \ + "$(ARCH)" = "x86-64-avx2" -o \ + "$(ARCH)" = "x86-64-bmi" -o \ + "$(ARCH)" = "x86-64-avx" -o \ + "$(ARCH)" = "x86-64-popcnt" -o \ + "$(ARCH)" = "x86-64-sse41" -o \ + "$(ARCH)" = "x86-64-ssse3" -o \ + "$(ARCH)" = "x86-64-abm" -o \ + "$(ARCH)" = "x86-64" -o \ + "$(ARCH)" = "x86-32" \ + ] diff --git a/src/arch/i386/arch.h b/src/arch/i386/arch.h new file mode 100644 index 00000000000..1ef79088240 --- /dev/null +++ b/src/arch/i386/arch.h @@ -0,0 +1,595 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_ARCH_H_INCLUDED +#define I386_ARCH_H_INCLUDED + +#if !defined(__i386__) && !defined(__amd64__) +#error "Not supported in the current architecture." 
+#endif + +#include +#include +#include + +#include "common.h" + +#if defined(__AVX__) + +#include + +#elif defined(__SSE4_1__) + +#include + +#elif defined(__SSSE3__) + +#include + +#elif defined(__SSE2__) + +#include + +// Some AMD processors with ABM do not support SSSE3/SSE4.1. +#if defined(__POPCNT__) +#include +#endif + +#elif defined(__SSE__) + +#include + +#endif + +#ifdef ARCH_NATIVE + +// Do not use BMI2 PDEP/PEXT on AMD Zen 1/2. +#if defined(__BMI2__) && (defined(__znver1__) || defined(__znver2__)) +#define NO_PDEP_PEXT 1 +#endif + +// Enable AVX-512 on AMD Zen 5 only. +#if defined(__AVX512F__) && !defined(__znver5__) +#define NO_AVX512 1 +#endif + +#endif // ARCH_NATIVE + +namespace Stockfish { + +enum class PrefetchHint { + ET0 = 7, + T0 = 3, + T1 = 2, + T2 = 1, + NTA = 0 +}; + +// Register size is equal to address bits. +inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8; + +#if defined(__BMI2__) && !defined(NO_PDEP_PEXT) +inline constexpr bool ArchImpl::UsePEXT = true; +#else +inline constexpr bool ArchImpl::UsePEXT = false; +#endif + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) { +#ifdef __SSE__ + constexpr int __Hint = [] { + if constexpr (Hint == -1) + return 3; + else +#ifdef __PRFCHW__ + return (Hint & 0x4) ? 7 : (Hint & 0x3); +#else + return Hint & 0x3; +#endif + }(); + + // GCC doesn't comply with Intel Intrinsics Guide and uses enum instead + // of int. 
+#if STOCKFISH_COMPILER == STOCKFISH_COMPILER_GCC + _mm_prefetch(m, [] { + if constexpr (__Hint == 7) + return _MM_HINT_ET0; + else if constexpr (__Hint == 3) + return _MM_HINT_T0; + else if constexpr (__Hint == 2) + return _MM_HINT_T1; + else if constexpr (__Hint == 1) + return _MM_HINT_T2; + else + return _MM_HINT_NTA; + }()); +#else + _mm_prefetch(m, __Hint); +#endif + +#endif // __SSE__ +} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + +#ifdef __POPCNT__ + if constexpr (sizeof(T) == 8) + return _mm_popcnt_u64(std::uint64_t(n)); + else + return _mm_popcnt_u32(std::uint32_t(n)); +#else + if constexpr (!is_64bit() && sizeof(T) == 8) + return __popcount_use_table(n); + else + return __popcount_value(n); +#endif +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { +#if defined(__BMI2__) && !defined(NO_PDEP_PEXT) + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + + if constexpr (sizeof(T) == 8) + return _pext_u64(std::uint64_t(n), std::uint64_t(mask)); + else + return _pext_u32(std::uint32_t(n), std::uint32_t(mask)); +#else + return 0; +#endif +} + +// =========================================================================== +// The functions below are used on i386/AMD64 targets only. 
+// =========================================================================== + +template +inline std::make_unsigned_t blsr(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + +#ifdef __BMI__ + if constexpr (sizeof(T) == 8) + return _blsr_u64(std::uint64_t(n)); + else + return _blsr_u32(std::uint32_t(n)); +#else + return std::make_unsigned_t(n) & std::make_unsigned_t(n - 1); +#endif +} + +template +inline int tzcnt(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + +#ifdef __BMI__ + if constexpr (sizeof(T) == 8) + return _tzcnt_u64(std::uint64_t(n)); + else + return _tzcnt_u32(std::uint32_t(n)); +#else + assert(n != 0); + + if constexpr (sizeof(T) == 8) + return __builtin_ctzll(n); + else + return __builtin_ctz(n); +#endif +} + +#ifdef __SSE2__ + +template +struct is_valid_vector { + static constexpr bool value = sizeof(T) == 16 +#ifdef __AVX2__ + || sizeof(T) == 32 +#endif +#ifdef __AVX512F__ + || sizeof(T) == 64 +#endif + ; +}; + +template +inline constexpr bool is_valid_vector_v = is_valid_vector::value; + +template +inline T _mm_setzero_v() { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_setzero_si512(); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + return _mm256_setzero_si256(); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_setzero_si128(); +} + +template +inline T _mm_set1_epi16_v(std::uint16_t n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_set1_epi16(n); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + return _mm256_set1_epi16(n); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_set1_epi16(n); +} + +template +inline T _mm_set1_epi32_v(std::uint32_t n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_set1_epi32(n); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) 
== 32) + return _mm256_set1_epi32(n); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_set1_epi32(n); +} + +template +inline T _mm_packus_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_packus_epi16(a, b); +#else + static_assert(false, "_mm_packus_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_packus_epi16(a, b); +#else + static_assert(false, "_mm_packus_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_packus_epi16(a, b); +} + +template +inline T _mm_add_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_add_epi16(a, b); +#else + static_assert(false, "_mm_add_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_add_epi16(a, b); +#else + static_assert(false, "_mm_add_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_add_epi16(a, b); +} + +template +inline T _mm_add_epi32_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_add_epi32(a, b); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_add_epi32(a, b); +#else + static_assert(false, "_mm_add_epi32_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_add_epi32(a, b); +} + +template +inline T _mm_sub_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_sub_epi16(a, b); +#else + static_assert(false, "_mm_sub_epi16_v<__m512i> is not 
allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_sub_epi16(a, b); +#else + static_assert(false, "_mm_sub_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_sub_epi16(a, b); +} + +template +inline T _mm_sub_epi32_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_sub_epi32(a, b); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_sub_epi32(a, b); +#else + static_assert(false, "_mm_sub_epi32_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_sub_epi32(a, b); +} + +template +inline T _mm_mulhi_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_mulhi_epi16(a, b); +#else + static_assert(false, "vmulhi_16<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_mulhi_epi16(a, b); +#else + static_assert(false, "vmulhi_16<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_mulhi_epi16(a, b); +} + +template +inline T _mm_slli_epi16_v(T a, int n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_slli_epi16(a, n); +#else + static_assert(false, "_mm_slli_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_slli_epi16(a, n); +#else + static_assert(false, "_mm_slli_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_slli_epi16(a, n); +} + +template +inline T _mm_max_epi16_v(T a, T b) { + 
static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_max_epi16(a, b); +#else + static_assert(false, "_mm_max_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_max_epi16(a, b); +#else + static_assert(false, "_mm_max_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_max_epi16(a, b); +} + +template +inline T _mm_min_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_min_epi16(a, b); +#else + static_assert(false, "vmin_16<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_min_epi16(a, b); +#else + static_assert(false, "vmin_16<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_min_epi16(a, b); +} + +template +inline std::int32_t _mm_reduce_add_epi32_v(T a) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_reduce_add_epi32(a); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + { + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_BADC)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum); + } +#endif + + if constexpr (sizeof(T) == 16) + { + a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0x4E)); // _MM_PERM_BADC + a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0xB1)); // _MM_PERM_CDAB + return _mm_cvtsi128_si32(a); + } +} + +// Non-VNNI implementation of dpbusd works even with type saturation, only +// because output values are clamped in ReLU layers immediately after +// AffineTransform 
layer. Do not use this without VNNI for general purpose. +template +inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + { +#if defined(__AVX512VNNI__) + + acc = _mm512_dpbusd_epi32(acc, a, b); + +#elif defined(__AVX512BW__) + + __m512i product = _mm512_maddubs_epi16(a, b); + product = _mm512_madd_epi16(product, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, product); + +#else + static_assert(false, "_mm_dpbusd_epi32_v<__m512i> is not allowed without AVX-512 BW."); +#endif + } +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + + acc = _mm256_dpbusd_epi32(acc, a, b); + +#elif defined(__AVX2__) + + __m256i product = _mm256_madd_epi16(_mm256_maddubs_epi16(a, b), _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, product); + +#else + static_assert(false, "_mm_dpbusd_epi32_v<__m256i> is not allowed without AVX2."); +#endif + } +#endif + + if constexpr (sizeof(T) == 16) + { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + + acc = _mm_dpbusd_epi32(acc, a, b); + +#elif defined(__SSSE3__) + + __m128i product = _mm_madd_epi16(_mm_maddubs_epi16(a, b), _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product); + +#else + + __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + __m128i a1 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + __m128i sgn = _mm_cmplt_epi8(b, _mm_setzero_si128()); + __m128i b0 = _mm_unpacklo_epi8(b, sgn); + __m128i b1 = _mm_unpackhi_epi8(b, sgn); + __m128i product0 = _mm_madd_epi16(a0, b0); + __m128i product1 = _mm_madd_epi16(a1, b1); + __m128i product = _mm_madd_epi16(_mm_packs_epi32(product0, product1), _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product); + +#endif + } +} + +#endif // __SSE2__ + +} // namespace Stockfish + +#endif // I386_ARCH_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/affine_transform.h 
b/src/arch/i386/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..9155089a16a --- /dev/null +++ b/src/arch/i386/nnue/layers/affine_transform.h @@ -0,0 +1,118 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/affine_transform.h" + +#else + +#include "../../arch.h" + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + i % 4; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + if constexpr (OutputDimensions > 1) + { +#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \ + && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType); + static_assert(OutputDimensions % OutputLanes == 0); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + vec_t acc[NumRegs]; + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType i = 0; i < NumChunks; ++i) + { + const vec_t in = + _mm_set1_epi32_v(reinterpret_cast(input)[i]); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + _mm_dpbusd_epi32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; + } + else if constexpr (OutputDimensions == 1) + { + // We cannot use AVX512 for the last layer because there are only + // 32 inputs and the buffer is not padded to 64 elements. 
+#if defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType InputLanes = sizeof(vec_t) / sizeof(InputType); + static_assert(PaddedInputDimensions % InputLanes == 0); + + static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes; + + vec_t sum = _mm_setzero_v(); + + for (IndexType j = 0; j < NumChunks; ++j) + { + const vec_t in = reinterpret_cast(input)[j]; + const vec_t row = reinterpret_cast(weights)[j]; + _mm_dpbusd_epi32_v(sum, in, row); + } + + output[0] = _mm_reduce_add_epi32_v(sum) + biases[0]; + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/affine_transform_sparse_input.h b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h new file mode 100644 index 00000000000..dbc20f704ad --- /dev/null +++ b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h @@ -0,0 +1,168 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED +#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include "../../arch.h" + +#include +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +#ifdef __SSSE3__ + +alignas(CacheLineSize) static const std::array, 256> lookupIndices = + [] { + std::array, 256> array{}; + for (std::uint64_t i = 0; i < 256; ++i) + { + std::uint64_t j = i, k = 0; + while (j) + array[i][k++] = tzcnt(j), j = blsr(j); + } + return array; + }(); + +template +class AffineTransformSparseInput: public AffineTransform { + __DEFINE_BASE_PROPERTIES + + static_assert(OutputDimensions % 16 == 0, + "OutputDimensions must be multiple of 16 for this layer."); + + public: + void propagate(const InputType* input, OutputType* output) const; + + private: + template + static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) { +#if defined(__AVX512F__) && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType InputLanes = sizeof(vec_t) / 4; + + // Inputs are processed InputLanes at a time and outputs are processed + // 8 at a time so we process in chunks of max(InputLanes, 8). + static constexpr IndexType ChunkSize = std::max(InputLanes, 8); + static constexpr IndexType NumChunks = InputDimensions / ChunkSize; + static constexpr IndexType InputsPerChunk = ChunkSize / InputLanes; + static constexpr IndexType OutputsPerChunk = ChunkSize / 8; + + IndexType count = 0; + __m128i base = _mm_setzero_si128(); + + for (IndexType i = 0; i < NumChunks; ++i) + { + std::uint32_t nnz = 0; + for (IndexType j = 0; j < InputsPerChunk; ++j) + { + const vec_t chunk = reinterpret_cast(input)[i * InputsPerChunk + j]; + + // Since all 32-bit blocks are non-negative, it is safe to use cmpgt + // if the target architecture does not support cmpneq. 
+#if defined(__AVX512F__) && !defined(NO_AVX512) + const std::uint32_t mask = _mm512_cmpneq_epi32_mask(chunk, _mm512_setzero_si512()); +#elif defined(__AVX2__) + const std::uint32_t mask = _mm256_movemask_ps( + _mm256_castsi256_ps(_mm256_cmpgt_epi32(chunk, _mm256_setzero_si256()))); +#elif defined(__AVX__) + const std::uint32_t mask = _mm256_movemask_ps( + _mm256_cmp_ps(_mm256_castsi256_ps(chunk), _mm256_setzero_ps(), _CMP_NEQ_UQ)); +#else + const std::uint32_t mask = + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(chunk, _mm_setzero_si128()))); +#endif + + nnz |= mask << (j * InputLanes); + } + for (IndexType j = 0; j < OutputsPerChunk; ++j) + { + const std::uint8_t lookup = (nnz >> (j * 8)) & 0xFF; + const __m128i offsets = *reinterpret_cast(&lookupIndices[lookup]); + _mm_storeu_si128(reinterpret_cast<__m128i_u*>(indices + count), + _mm_add_epi16(base, offsets)); + count += popcount(lookup); + base = _mm_add_epi16(base, _mm_set1_epi16(8)); + } + } + + return count; + } +}; + +template +void AffineTransformSparseInput::propagate(const InputType* input, + OutputType* output) const { +#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \ + && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + vec_t acc[NumRegs]; + std::uint16_t nnz[NumChunks]; + IndexType count = populate_nz_indices(input, nnz); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType j = 0; j < count; ++j) + { + const auto i = nnz[j]; + const vec_t in = _mm_set1_epi32_v(reinterpret_cast(input)[i]); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + for (std::size_t k = 0; 
k < array_size(acc); ++k) + _mm_dpbusd_epi32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; +} + +#else + +template +using AffineTransformSparseInput = AffineTransform; + +#endif // __SSSE3__ + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/clipped_relu.h b/src/arch/i386/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..187d77554f0 --- /dev/null +++ b/src/arch/i386/nnue/layers/clipped_relu.h @@ -0,0 +1,133 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + static_assert(PaddedOutputDimensions % 32 == 0); + + // Do not use 256-bit registers on AVX as it does not have shift + // instructions, instead fall back to SSE4.1. +#ifdef __AVX2__ + +#if defined(__AVX512F__) && defined(__AVX512BW__) && !defined(NO_AVX512) + if constexpr (PaddedOutputDimensions >= 64) + { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 64; + + static const __m512i permuteTable512 = + _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m512i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m512i words0 = + _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m512i words1 = + _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = _mm512_permutexvar_epi32(permuteTable512, _mm512_packs_epi16(words0, words1)); + } + } + constexpr IndexType Start = PaddedOutputDimensions / 64 * 64; +#else + constexpr IndexType Start = 0; +#endif + + if constexpr (Start != PaddedOutputDimensions) + { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 32; + + static const __m256i permuteTable256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m256i*>(output); + + for (IndexType i = Start / 32; i < NumChunks; ++i) + { + const __m256i words0 = + _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m256i words1 = + _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = + 
_mm256_permutevar8x32_epi32(_mm256_packs_epi16(words0, words1), permuteTable256); + } + } + +#else // __SSE2__ + + static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 16; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { +#ifdef __SSE4_1__ + const __m128i words0 = + _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m128i words1 = + _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = _mm_packs_epi16(words0, words1); +#else + static const __m128i s8min = _mm_set1_epi8(-0x80); + + const __m128i words0 = + _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m128i words1 = + _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + const __m128i bytes = _mm_packs_epi16(words0, words1); + + out[i] = _mm_subs_epi8(_mm_adds_epi8(bytes, s8min), s8min); +#endif + } + +#endif +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/sqr_clipped_relu.h b/src/arch/i386/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..bdab25693d0 --- /dev/null +++ b/src/arch/i386/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,68 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. +#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/sqr_clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 16; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { + __m128i words0 = _mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]); + __m128i words1 = _mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]); + + // We shift by WeightScaleBits * 2 = 12 and divide by 128 which is + // an additional shift-right of 7, meaning 19 in total. Mulhi + // strips the lower 16 bits so we need to shift by 3 more. 
+ words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3); + words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3); + + out[i] = _mm_packs_epi16(words0, words1); + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/i386/nnue/nnue_feature_transformer.h b/src/arch/i386/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..a6d321f465c --- /dev/null +++ b/src/arch/i386/nnue/nnue_feature_transformer.h @@ -0,0 +1,435 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/nnue_feature_transformer.h" + +#else + +#include "../arch.h" + +#include +#include + +#include "misc.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +struct FeatureTransformer::Details { +#if defined(__AVX512F__) && defined(__AVX512BW__) && !defined(NO_AVX512) + // The size of the current PSQT weights array is too small for AVX-512. + using vec_t = __m512i; + using psqt_vec_t = __m256i; +#elif defined(__AVX2__) + using vec_t = __m256i; + using psqt_vec_t = __m256i; +#else + using vec_t = __m128i; + using psqt_vec_t = __m128i; +#endif + + private: +#if defined(__AVX512F__) + // EVEX encoding scheme, but uses 16 only. Need to check <=32 + static constexpr int NumXMM = 16; +#else + static constexpr int NumXMM = is_64bit() ? 16 : 8; +#endif + + public: + static constexpr std::size_t AccRegisterSize = sizeof(vec_t); + static constexpr std::size_t PSQTRegisterSize = sizeof(psqt_vec_t); + + static constexpr int OptimalAccRegisterCount = + optimal_register_count(); + static constexpr int OptimalPSQTRegisterCount = + optimal_register_count(); + + static constexpr IndexType TileHeight = + OptimalAccRegisterCount * AccRegisterSize / sizeof(WeightType); + static constexpr IndexType PsqtTileHeight = + OptimalPSQTRegisterCount * PSQTRegisterSize / sizeof(PSQTWeightType); + + static_assert(HalfDimensions % TileHeight == 0, + "HalfDimensions must be multiple of TileHeight"); + static_assert(PSQTBuckets % PsqtTileHeight == 0, + "PSQTBuckets must be multiple of PsqtTileHeight"); +}; + +template +static inline constexpr void permute_pack(std::uint64_t* v) { + if constexpr (RegisterSize == 64) + if constexpr (Write) + { + std::uint64_t tmp0 = v[2], tmp1 = v[3]; + v[2] = v[8], v[3] = v[9]; + v[8] = v[4], v[9] = v[5]; + v[4] = tmp0, v[5] = tmp1; + tmp0 = v[6], tmp1 = v[7]; + v[6] = v[10], v[7] = v[11]; + 
v[10] = v[12], v[11] = v[13]; + v[12] = tmp0, v[13] = tmp1; + } + else + { + std::uint64_t tmp0 = v[2], tmp1 = v[3]; + v[2] = v[4], v[3] = v[5]; + v[4] = v[8], v[5] = v[9]; + v[8] = tmp0, v[9] = tmp1; + tmp0 = v[6], tmp1 = v[7]; + v[6] = v[12], v[7] = v[13]; + v[12] = v[10], v[13] = v[11]; + v[10] = tmp0, v[11] = tmp1; + } + else if constexpr (RegisterSize == 32) + { + std::swap(v[2], v[4]); + std::swap(v[3], v[5]); + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() { + // The weight numbers are permuted preliminarily, due to the use of + // AVX2/AVX-512 pack intrinsics. + if constexpr (Details::AccRegisterSize >= 32) + { + constexpr IndexType Width = Details::AccRegisterSize == 64 ? 16 : 8; + + for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / 8; i += Width) + permute_pack( + &reinterpret_cast(biases)[i]); + + for (IndexType j = 0; j < InputDimensions; ++j) + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 8; i += Width) + permute_pack( + &reinterpret_cast(&weights[j * HalfDimensions])[i]); + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + using vec_t = typename Details::vec_t; + using psqt_vec_t = typename Details::psqt_vec_t; + + // The most common case when updating the accumulator incrementally. + // Calculates feature differences directly without using tiling mechanism. 
+ if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) + { + const auto accIn = + reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); + const auto accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); + + const IndexType offsetR0 = HalfDimensions * removed[0]; + const auto columnR0 = reinterpret_cast(&weights[offsetR0]); + const IndexType offsetA = HalfDimensions * added[0]; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = _mm_add_epi16_v(_mm_sub_epi16_v(accIn[i], columnR0[i]), columnA[i]); + } + else + { + const IndexType offsetR1 = HalfDimensions * removed[1]; + const auto columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = _mm_sub_epi16_v(_mm_add_epi16_v(accIn[i], columnA[i]), + _mm_add_epi16_v(columnR0[i], columnR1[i])); + } + + const auto accPsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][0]); + const auto accPsqtOut = + reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); + + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; + auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); + const IndexType offsetPsqtA = PSQTBuckets * added[0]; + auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); + ++i) + accPsqtOut[i] = + _mm_add_epi32_v(_mm_sub_epi32_v(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA[i]); + } + else + { + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; + const auto columnPsqtR1 = + reinterpret_cast(&psqtWeights[offsetPsqtR1]); + + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); + ++i) + accPsqtOut[i] = 
_mm_sub_epi32_v(_mm_add_epi32_v(accPsqtIn[i], columnPsqtA[i]), + _mm_add_epi32_v(columnPsqtR0[i], columnPsqtR1[i])); + } + } + else + { + vec_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType i = 0; i < HalfDimensions / Details::TileHeight; ++i) + { + const IndexType offsetRow = i * Details::TileHeight; + + const auto accTileIn = reinterpret_cast( + &(computed->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = accTileIn[j]; + + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = _mm_sub_epi16_v(acc[j], column[j]); + } + + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = _mm_add_epi16_v(acc[j], column[j]); + } + + const auto accTileOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + accTileOut[j] = acc[j]; + } + + psqt_vec_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType i = 0; i < PSQTBuckets / Details::PsqtTileHeight; ++i) + { + const IndexType offsetRow = i * Details::PsqtTileHeight; + + auto accTilePsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = accTilePsqtIn[j]; + + for (const auto index : removed) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = _mm_sub_epi32_v(psqt[j], columnPsqt[j]); + } + + for (const auto index : added) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = 
reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = _mm_add_epi32_v(psqt[j], columnPsqt[j]); + } + + auto accTilePsqtOut = reinterpret_cast( + &(next->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + accTilePsqtOut[j] = psqt[j]; + } + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + using vec_t = typename Details::vec_t; + using psqt_vec_t = typename Details::psqt_vec_t; + + vec_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType j = 0; j < HalfDimensions / Details::TileHeight; ++j) + { + const IndexType offsetRow = j * Details::TileHeight; + + const auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][offsetRow]); + const auto entryTile = reinterpret_cast(&entry.accumulation[offsetRow]); + + for (IndexType k = 0; k < array_size(acc); ++k) + acc[k] = entryTile[k]; + + std::size_t i = 0; + for (; i < std::min(removed.size(), added.size()); ++i) + { + const IndexType offsetR = HalfDimensions * removed[i] + offsetRow; + const auto columnR = reinterpret_cast(&weights[offsetR]); + const IndexType offsetA = HalfDimensions * added[i] + offsetRow; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_add_epi16_v(acc[k], _mm_sub_epi16_v(columnA[k], columnR[k])); + } + for (; i < removed.size(); ++i) + { + const IndexType offset = HalfDimensions * removed[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_sub_epi16_v(acc[k], column[k]); + } + for (; i < added.size(); ++i) + { + const IndexType offset = HalfDimensions * added[i] + offsetRow; + const auto 
column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_add_epi16_v(acc[k], column[k]); + } + + for (std::size_t k = 0; k < array_size(acc); k++) + entryTile[k] = acc[k]; + for (std::size_t k = 0; k < array_size(acc); k++) + accTile[k] = acc[k]; + } + + psqt_vec_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType j = 0; j < PSQTBuckets / Details::PsqtTileHeight; ++j) + { + const IndexType offsetRow = j * Details::PsqtTileHeight; + + const auto accTilePsqt = + reinterpret_cast(&accumulator.psqtAccumulation[Perspective][offsetRow]); + const auto entryTilePsqt = + reinterpret_cast(&entry.psqtAccumulation[offsetRow]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = entryTilePsqt[k]; + + for (std::size_t i = 0; i < removed.size(); ++i) + { + const IndexType offset = PSQTBuckets * removed[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = _mm_sub_epi32_v(psqt[k], columnPsqt[k]); + } + for (std::size_t i = 0; i < added.size(); ++i) + { + const IndexType offset = PSQTBuckets * added[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = _mm_add_epi32_v(psqt[k], columnPsqt[k]); + } + + for (std::size_t k = 0; k < array_size(psqt); ++k) + entryTilePsqt[k] = psqt[k]; + for (std::size_t k = 0; k < array_size(psqt); ++k) + accTilePsqt[k] = psqt[k]; + } +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + using vec_t = typename Details::vec_t; + + static constexpr IndexType OutputChunkSize = Details::AccRegisterSize / sizeof(OutputType); + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + + static constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; + + const Color perspectives[2] = 
{pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const auto in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const auto in1 = + reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + const auto out = reinterpret_cast(&output[(HalfDimensions / 2) * p]); + + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + // What we want to do is multiply inputs in a pairwise manner + // (after clipping), and then shift right by 9. Instead, we + // shift left by 7, and use mulhi, stripping the bottom 16 bits, + // effectively shifting right by 16, resulting in a net shift + // of 9 bits. We use mulhi because it maintains the sign of + // the multiplication (unlike mullo), allowing us to make use + // of packus to clip 2 of the inputs, resulting in a save of 2 + // "_mm_max_epi16_v" calls. + + static const vec_t Zeroes = _mm_setzero_v(); + static const vec_t Ones = _mm_set1_epi16_v(127 * 2); + + const vec_t sum0a = + _mm_slli_epi16_v(_mm_max_epi16_v(_mm_min_epi16_v(in0[j * 2 + 0], Ones), Zeroes), 7); + const vec_t sum0b = + _mm_slli_epi16_v(_mm_max_epi16_v(_mm_min_epi16_v(in0[j * 2 + 1], Ones), Zeroes), 7); + const vec_t sum1a = _mm_min_epi16_v(in1[j * 2 + 0], Ones); + const vec_t sum1b = _mm_min_epi16_v(in1[j * 2 + 1], Ones); + + const vec_t pa = _mm_mulhi_epi16_v(sum0a, sum1a); + const vec_t pb = _mm_mulhi_epi16_v(sum0b, sum1b); + + out[j] = _mm_packus_epi16_v(pa, pb); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // !__SSE2__ + +#endif // I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/bitboard.cpp b/src/bitboard.cpp index a8b4e5f4464..5545e5133e1 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -16,17 +16,16 @@ along with this program. If not, see . 
*/ -#include "bitboard.h" #include #include #include +#include "bitboard.h" #include "misc.h" namespace Stockfish { -uint8_t PopCnt16[1 << 16]; uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; Bitboard LineBB[SQUARE_NB][SQUARE_NB]; @@ -50,34 +49,13 @@ Bitboard safe_destination(Square s, int step) { Square to = Square(s + step); return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); } -} - -// Returns an ASCII representation of a bitboard suitable -// to be printed to standard output. Useful for debugging. -std::string Bitboards::pretty(Bitboard b) { - - std::string s = "+---+---+---+---+---+---+---+---+\n"; - - for (Rank r = RANK_8; r >= RANK_1; --r) - { - for (File f = FILE_A; f <= FILE_H; ++f) - s += b & make_square(f, r) ? "| X " : "| "; - - s += "| " + std::to_string(1 + r) + "\n+---+---+---+---+---+---+---+---+\n"; - } - s += " a b c d e f g h\n"; - - return s; -} +} // namespace // Initializes various bitboard tables. It is called at // startup and relies on global objects to be already zero-initialized. void Bitboards::init() { - for (unsigned i = 0; i < (1 << 16); ++i) - PopCnt16[i] = uint8_t(std::bitset<16>(i).count()); - for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2) SquareDistance[s1][s2] = std::max(distance(s1, s2), distance(s1, s2)); @@ -163,7 +141,7 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { // apply to the 64 or 32 bits word to get the index. Magic& m = magics[s]; m.mask = sliding_attack(pt, s, 0) & ~edges; - m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask); + m.shift = (is_64bit() ? 64 : 32) - popcount(m.mask); // Set the offset for the attacks table of the square. We have individual // table sizes for each square with "Fancy Magic Bitboards". 
@@ -177,17 +155,17 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { occupancy[size] = b; reference[size] = sliding_attack(pt, s, b); - if (HasPext) + if constexpr (use_pext()) m.attacks[pext(b, m.mask)] = reference[size]; size++; b = (b - m.mask) & m.mask; } while (b); - if (HasPext) + if constexpr (use_pext()) continue; - PRNG rng(seeds[Is64Bit][rank_of(s)]); + PRNG rng(seeds[is_64bit()][rank_of(s)]); // Find a magic for square 's' picking up an (almost) random number // until we find the one that passes the verification test. diff --git a/src/bitboard.h b/src/bitboard.h index cdff4c759bc..d75ec420d85 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -26,14 +26,14 @@ #include #include +#include "common.h" #include "types.h" namespace Stockfish { namespace Bitboards { -void init(); -std::string pretty(Bitboard b); +void init(); } // namespace Stockfish::Bitboards @@ -55,7 +55,6 @@ constexpr Bitboard Rank6BB = Rank1BB << (8 * 5); constexpr Bitboard Rank7BB = Rank1BB << (8 * 6); constexpr Bitboard Rank8BB = Rank1BB << (8 * 7); -extern uint8_t PopCnt16[1 << 16]; extern uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; extern Bitboard BetweenBB[SQUARE_NB][SQUARE_NB]; @@ -73,11 +72,10 @@ struct Magic { // Compute the attack's index using the 'magic bitboards' approach unsigned index(Bitboard occupied) const { - - if (HasPext) + if constexpr (use_pext()) return unsigned(pext(occupied, mask)); - if (Is64Bit) + if constexpr (is_64bit()) return unsigned(((occupied & mask) * magic) >> shift); unsigned lo = unsigned(occupied) & unsigned(mask); @@ -259,29 +257,6 @@ inline Bitboard attacks_bb(PieceType pt, Square s, Bitboard occupied) { } } - -// Counts the number of non-zero bits in a bitboard. 
-inline int popcount(Bitboard b) { - -#ifndef USE_POPCNT - - union { - Bitboard bb; - uint16_t u[4]; - } v = {b}; - return PopCnt16[v.u[0]] + PopCnt16[v.u[1]] + PopCnt16[v.u[2]] + PopCnt16[v.u[3]]; - -#elif defined(_MSC_VER) - - return int(_mm_popcnt_u64(b)); - -#else // Assumed gcc or compatible compiler - - return __builtin_popcountll(b); - -#endif -} - // Returns the least significant bit in a non-zero bitboard. inline Square lsb(Bitboard b) { assert(b); diff --git a/src/common.h b/src/common.h new file mode 100644 index 00000000000..eaf5c2ed551 --- /dev/null +++ b/src/common.h @@ -0,0 +1,205 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef COMMON_H_INCLUDED +#define COMMON_H_INCLUDED + +#include +#include +#include +#include +#include +#include + +// When compiling with provided Makefile (e.g. for Linux and OSX), +// configuration is done automatically. To get started, type 'make help'. +// +// When Makefile is not used (e.g. with Microsoft Visual Studio), some macros +// need to be pre-defined manually: +// +// NDEBUG Disable debugging mode. Always use this for release. +// +// __SSE__ Generate x86 prefetch instruction. +// __SSE2__ Generate x86 SSE2 SIMD instructions. +// __SSSE3__ Generate x86 SSSE3 SIMD instructions. 
+// __SSE4_1__ Generate x86 SSE4.1 SIMD instructions. +// __POPCNT__ Generate x86 POPCNT instruction. +// __PRFCHW__ Generate x86 PREFETCHW instruction. (not used currently) +// __AVX__ Generate x86 AVX SIMD instructions. +// __BMI__ Generate x86 BLSR and TZCNT instructions. +// __AVX2__ Generate x86 AVX2 SIMD instructions. +// __BMI2__ Generate x86 PEXT instruction. +// __AVX512F__ Generate x86 AVX-512 SIMD instructions. +// __AVX512BW__ ... +// __AVX512VL__ ... +// __AVX512VNNI__ ... + +#define STOCKFISH_COMPILER_UNKNOWN 0 +#define STOCKFISH_COMPILER_GCC 1 +#define STOCKFISH_COMPILER_CLANG 2 +#define STOCKFISH_COMPILER_INTEL 3 +#define STOCKFISH_COMPILER_MSVC 4 + +#if defined(__GNUC__) + #if defined(__INTEL_LLVM_COMPILER) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_INTEL + #elif defined(__clang__) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_CLANG + #else + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_GCC + #endif +#elif defined(_MSC_VER) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_MSVC +#else + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_UNKNOWN +#endif + +#if STOCKFISH_COMPILER == STOCKFISH_COMPILER_GCC + #if __GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2) + #define ALIGNAS_ON_STACK_VARIABLES_BROKEN + #endif +#elif STOCKFISH_COMPILER == STOCKFISH_COMPILER_MSVC + #pragma warning(disable: 4127) // Conditional expression is constant + #pragma warning(disable: 4146) // Unary minus operator applied to unsigned type + #pragma warning(disable: 4800) // Forcing value to bool 'true' or 'false' + + #if defined(_WIN64) + #include + #endif +#endif + +#define ASSERT_ALIGNED(ptr, alignment) \ + assert(reinterpret_cast(ptr) % alignment == 0) + +namespace Stockfish { + +template +constexpr std::size_t array_size(T (&)[N]) { + return N; +} + +// Instead of using raw integer values, give each hint a comprehensive name. +// Default is always -1, however, the actual value of it is defined in +// implementation detail. 
+enum class PrefetchHint; + +// This struct is used to provide generalized functionalities that might have +// different implementations depending on the target architecture. Each +// function defined in this struct is specialized in arch.h file respectively. +struct ArchImpl { + /// Does the struct name fit to its purpose? + + static const bool Is64Bit; + + static const bool UsePEXT; // used in Bitboard + + // Clang apparently does not follow SFIANE in if constexpr statements, + // therefore annotate arguments with maybe_unused attribute to avoid + // warnings. + + template + static inline void prefetch(const void* m); + + template + static inline unsigned int popcount(T n); + + template + static inline T pext(T n, T mask); +}; + +constexpr bool is_64bit() { return ArchImpl::Is64Bit; } + +constexpr bool use_pext() { return ArchImpl::UsePEXT; } + +// 64-bit builtin popcount is sometimes slower than using table, especially on +// 32-bit environment. Therefore we provide two versions of it and leave it +// for each platform to decide which one to use. 
+template +inline int __popcount_use_table(T n) { + static_assert(std::is_integral_v && sizeof(T) % 2 == 0); + + static const std::array popcntTable = [] { + std::array table; + for (int i = 0; i < 1 << 16; ++i) + table[i] = std::uint8_t(std::bitset<16>(i).count()); + return table; + }(); + + union { + T raw; + std::uint16_t words[sizeof(T) / 2]; + } v = {n}; + + int count = 0; + for (std::size_t i = 0; i < sizeof(T) / 2; ++i) + count += popcntTable[v.words[i]]; + + return count; +} + +template +inline int __popcount_value(T n) { +#ifdef __GNUC__ + if constexpr (sizeof(T) == 8) + return __builtin_popcountll(std::uint64_t(n)); + else + return __builtin_popcount(std::uint32_t(n)); +#else + if constexpr (sizeof(T) == 8) + return __popcount_value(std::uint32_t(n)) + __popcount_value(std::uint32_t(n >> 32)); + else + { + n = n - ((n >> 1) & 0x55555555); + n = (n & 0x33333333) + ((n >> 2) & 0x33333333); + return (((n + (n >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + } +#endif +} + +template(-1)> +inline void prefetch(const void* m) { + return ArchImpl::prefetch(Hint)>(m); +} + +template +inline unsigned int popcount(T n) { + return ArchImpl::popcount(n); +} + +template +inline T pext(T n, T mask) { + return ArchImpl::pext(n, mask); +} + +} // namespace Stockfish + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/arch.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/arch.h" + +#else + + #include "arch/generic/arch.h" + +#endif + +#endif // COMMON_H_INCLUDED diff --git a/src/engine.cpp b/src/engine.cpp index b5cc3f832f5..04d0be51b3a 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -28,6 +28,7 @@ #include #include +#include "common.h" #include "evaluate.h" #include "misc.h" #include "nnue/network.h" @@ -45,7 +46,7 @@ namespace Stockfish { namespace NN = Eval::NNUE; constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; -constexpr int MaxHashMB = Is64Bit ? 
33554432 : 2048; +constexpr int MaxHashMB = is_64bit() ? 33554432 : 2048; Engine::Engine(std::optional path) : binaryDirectory(path ? CommandLine::get_binary_directory(*path) : ""), diff --git a/src/memory.h b/src/memory.h index 3155a5aab12..f10bd6e56d9 100644 --- a/src/memory.h +++ b/src/memory.h @@ -27,7 +27,7 @@ #include #include -#include "types.h" +#include "common.h" namespace Stockfish { diff --git a/src/misc.cpp b/src/misc.cpp index 664ab4b89ff..39136606d94 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -16,8 +16,6 @@ along with this program. If not, see . */ -#include "misc.h" - #include #include #include @@ -31,6 +29,8 @@ #include #include +#include "common.h" +#include "misc.h" #include "types.h" namespace Stockfish { @@ -159,119 +159,119 @@ std::string engine_info(bool to_uci) { // Returns a string trying to describe the compiler we use std::string compiler_info() { + std::stringstream ss; -#define make_version_string(major, minor, patch) \ - stringify(major) "." stringify(minor) "." stringify(patch) - - // Predefined macros hell: - // - // __GNUC__ Compiler is GCC, Clang or ICX - // __clang__ Compiler is Clang or ICX - // __INTEL_LLVM_COMPILER Compiler is ICX - // _MSC_VER Compiler is MSVC - // _WIN32 Building on Windows (any) - // _WIN64 Building on Windows 64 bit - - std::string compiler = "\nCompiled by : "; + ss << "Compiler : "; #if defined(__INTEL_LLVM_COMPILER) - compiler += "ICX "; - compiler += stringify(__INTEL_LLVM_COMPILER); + ss << "ICX "; + ss << stringify(__VERSION); #elif defined(__clang__) - compiler += "clang++ "; - compiler += make_version_string(__clang_major__, __clang_minor__, __clang_patchlevel__); -#elif _MSC_VER - compiler += "MSVC "; - compiler += "(version "; - compiler += stringify(_MSC_FULL_VER) "." stringify(_MSC_BUILD); - compiler += ")"; + ss << "Clang "; + ss << __clang_major__ << '.' << __clang_minor__ << '.' 
<< __clang_patchlevel__; #elif defined(__e2k__) && defined(__LCC__) - #define dot_ver2(n) \ - compiler += char('.'); \ - compiler += char('0' + (n) / 10); \ - compiler += char('0' + (n) % 10); - - compiler += "MCST LCC "; - compiler += "(version "; - compiler += std::to_string(__LCC__ / 100); - dot_ver2(__LCC__ % 100) dot_ver2(__LCC_MINOR__) compiler += ")"; -#elif __GNUC__ - compiler += "g++ (GNUC) "; - compiler += make_version_string(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + ss << "MCST LCC "; + ss << (__LCC__ / 100) << '.' << std::setfill('0') << std::setw(2) << (__LCC__ % 100) << '.' + << __LCC_MINOR__; + ss.clear(); +#elif defined(__GNUC__) + ss << "g++ "; + ss << __GNUC__ << '.' << __GNUC_MINOR__ << '.' << __GNUC_PATCHLEVEL__; +#elif defined(_MSC_VER) + ss << "MSVC "; + ss << _MSC_FULL_VER << '.' << _MSC_BUILD; #else - compiler += "Unknown compiler "; - compiler += "(unknown version)"; + ss << "Unknown compiler"; #endif #if defined(__APPLE__) - compiler += " on Apple"; + ss << " on Apple"; #elif defined(__CYGWIN__) - compiler += " on Cygwin"; -#elif defined(__MINGW64__) - compiler += " on MinGW64"; + ss << " on Cygwin"; #elif defined(__MINGW32__) - compiler += " on MinGW32"; -#elif defined(__ANDROID__) - compiler += " on Android"; + ss << " on MinGW"; #elif defined(__linux__) - compiler += " on Linux"; + ss << " on Linux"; #elif defined(_WIN64) - compiler += " on Microsoft Windows 64-bit"; + ss << " on Microsoft Windows 64-bit"; #elif defined(_WIN32) - compiler += " on Microsoft Windows 32-bit"; + ss << " on Microsoft Windows 32-bit"; +#else + ss << " on unknown system"; +#endif + + ss << "\n"; + ss << "Build type : "; + +#ifdef NDEBUG + ss << "Release"; #else - compiler += " on unknown system"; + ss << "Debug"; #endif - compiler += "\nCompilation architecture : "; -#if defined(ARCH) - compiler += stringify(ARCH); + ss << "\n"; + ss << "Build profile : "; + +#ifdef ARCH + ss << stringify(ARCH); #else - compiler += "(undefined architecture)"; + ss << 
"unknown"; #endif - compiler += "\nCompilation settings : "; - compiler += (Is64Bit ? "64bit" : "32bit"); -#if defined(USE_VNNI) - compiler += " VNNI"; + ss << "\n"; + ss << "Compile options : "; + ss << (is_64bit() ? "64bit" : "32bit"); + +// x86/AMD64 family +#ifdef __AVX512F__ + ss << " AVX-512 (F"; + #ifdef __AVX512BW__ + ss << ",BW"; + #endif + #ifdef __AVX512VL__ + ss << ",VL"; + #endif + #ifdef __AVX512VNNI__ + ss << ",VNNI"; + #endif + ss << ")"; #endif -#if defined(USE_AVX512) - compiler += " AVX512"; +#ifdef __BMI2__ + ss << " BMI2"; #endif - compiler += (HasPext ? " BMI2" : ""); -#if defined(USE_AVX2) - compiler += " AVX2"; +#ifdef __AVX2__ + ss << " AVX2"; #endif -#if defined(USE_SSE41) - compiler += " SSE41"; +#ifdef __BMI__ + ss << " BMI"; #endif -#if defined(USE_SSSE3) - compiler += " SSSE3"; +#ifdef __AVX__ + ss << " AVX"; #endif -#if defined(USE_SSE2) - compiler += " SSE2"; +#ifdef __POPCNT__ + ss << " POPCNT"; #endif - compiler += (HasPopCnt ? " POPCNT" : ""); -#if defined(USE_NEON_DOTPROD) - compiler += " NEON_DOTPROD"; -#elif defined(USE_NEON) - compiler += " NEON"; +#ifdef __SSE4_1__ + ss << " SSE4.1"; #endif - -#if !defined(NDEBUG) - compiler += " DEBUG"; +#ifdef __SSSE3__ + ss << " SSSE3"; +#endif +#ifdef __SSE2__ + ss << " SSE2"; #endif - compiler += "\nCompiler __VERSION__ macro : "; -#ifdef __VERSION__ - compiler += __VERSION__; -#else - compiler += "(undefined macro)"; +// ARM/AArch64 family +#ifdef __ARM_FEATURE_DOTPROD + ss << " DotProd"; +#endif +#ifdef __ARM_NEON + ss << " Neon"; #endif - compiler += "\n"; + ss << "\n"; - return compiler; + return ss.str(); } @@ -407,24 +407,6 @@ void sync_cout_end() { std::cout << IO_UNLOCK; } // Trampoline helper to avoid moving Logger to misc.h void start_logger(const std::string& fname) { Logger::start(fname); } - -#ifdef NO_PREFETCH - -void prefetch(const void*) {} - -#else - -void prefetch(const void* addr) { - - #if defined(_MSC_VER) - _mm_prefetch((char const*) addr, _MM_HINT_T0); - #else - 
__builtin_prefetch(addr); - #endif -} - -#endif - #ifdef _WIN32 #include #define GETCWD _getcwd diff --git a/src/misc.h b/src/misc.h index ce49a1f6553..34d8c443018 100644 --- a/src/misc.h +++ b/src/misc.h @@ -38,11 +38,6 @@ namespace Stockfish { std::string engine_info(bool to_uci = false); std::string compiler_info(); -// Preloads the given address in L1/L2 cache. This is a non-blocking -// function that doesn't stall the CPU waiting for data to be loaded from memory, -// which can be quite slow. -void prefetch(const void* addr); - void start_logger(const std::string& fname); size_t str_to_size_t(const std::string& s); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 59a6149f0c4..c5cf5a40cc7 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -16,7 +16,17 @@ along with this program. If not, see . */ -// Definition of layer AffineTransform of NNUE evaluation function +// affine_transform.h contains the definition of AffineTransform layer. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// AffineTransform::propagate +// AffineTransform::get_weight_index +// +// Following class(es) must be defined in the architecture-specific files: +// +// AffineTransformSparseInput #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED @@ -24,105 +34,10 @@ #include #include -#include "../nnue_common.h" -#include "simd.h" - -/* - This file contains the definition for a fully connected layer (aka affine transform). - - - expected use-case is for when PaddedInputDimensions == 32 and InputDimensions <= 32. 
- - that's why AVX512 is hard to implement - - expected use-case is small layers - - inputs are processed in chunks of 4, weights are respectively transposed - - accumulation happens directly to int32s -*/ +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) - #define ENABLE_SEQ_OPT -#endif - -// Fallback implementation for older/other architectures. -// Requires the input to be padded to at least 16 values. -#ifndef ENABLE_SEQ_OPT - -template -static void affine_transform_non_ssse3(std::int32_t* output, - const std::int8_t* weights, - const std::int32_t* biases, - const std::uint8_t* input) { - #if defined(USE_SSE2) || defined(USE_NEON) - #if defined(USE_SSE2) - // At least a multiple of 16, with SSE2. - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; - const __m128i Zeros = _mm_setzero_si128(); - const auto inputVector = reinterpret_cast(input); - - #elif defined(USE_NEON) - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; - const auto inputVector = reinterpret_cast(input); - #endif - - for (IndexType i = 0; i < OutputDimensions; ++i) - { - const IndexType offset = i * PaddedInputDimensions; - - #if defined(USE_SSE2) - __m128i sumLo = _mm_cvtsi32_si128(biases[i]); - __m128i sumHi = Zeros; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - __m128i row_j = _mm_load_si128(&row[j]); - __m128i input_j = _mm_load_si128(&inputVector[j]); - __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); - __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); - __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); - __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); - __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); - __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); - sumLo = _mm_add_epi32(sumLo, 
productLo); - sumHi = _mm_add_epi32(sumHi, productHi); - } - __m128i sum = _mm_add_epi32(sumLo, sumHi); - __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sumHigh_64); - __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_second_32); - output[i] = _mm_cvtsi128_si32(sum); - - #elif defined(USE_NEON) - - int32x4_t sum = {biases[i]}; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); - product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); - sum = vpadalq_s16(sum, product); - } - output[i] = sum[0] + sum[1] + sum[2] + sum[3]; - - #endif - } - #else - std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions); - - // Traverse weights in transpose order to take advantage of input sparsity - for (IndexType i = 0; i < InputDimensions; ++i) - if (input[i]) - { - const std::int8_t* w = &weights[i]; - const int in = input[i]; - for (IndexType j = 0; j < OutputDimensions; ++j) - output[j] += w[j * PaddedInputDimensions] * in; - } - #endif -} - -#endif // !ENABLE_SEQ_OPT - template class AffineTransform { public: @@ -133,11 +48,12 @@ class AffineTransform { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = OutDims; + static_assert(InputDimensions > 0 && OutputDimensions > 0); static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); + ceil_to_multiple(InputDimensions, DimensionPadding); static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, MaxSimdWidth); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -150,19 +66,6 @@ class AffineTransform { return hashValue; } - static constexpr IndexType 
get_weight_index_scrambled(IndexType i) { - return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 - + i / PaddedInputDimensions * 4 + i % 4; - } - - static constexpr IndexType get_weight_index(IndexType i) { -#ifdef ENABLE_SEQ_OPT - return get_weight_index_scrambled(i); -#else - return i; -#endif - } - // Read network parameters bool read_parameters(std::istream& stream) { read_little_endian(stream, biases, OutputDimensions); @@ -181,126 +84,51 @@ class AffineTransform { return !stream.fail(); } + // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#ifdef ENABLE_SEQ_OPT - - if constexpr (OutputDimensions > 1) - { - #if defined(USE_AVX512) - using vec_t = __m512i; - #define vec_set_32 _mm512_set1_epi32 - #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 - #elif defined(USE_AVX2) - using vec_t = __m256i; - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_set_32 _mm_set1_epi32 - #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #elif defined(USE_NEON_DOTPROD) - using vec_t = int32x4_t; - #define vec_set_32 vdupq_n_s32 - #define vec_add_dpbusd_32(acc, a, b) \ - Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ - vreinterpretq_s8_s32(b)) - #endif - - static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); - - static_assert(OutputDimensions % OutputSimdWidth == 0); - - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; - constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; - - const auto input32 = reinterpret_cast(input); - const vec_t* biasvec = reinterpret_cast(biases); - vec_t acc[NumRegs]; - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasvec[k]; - - for (IndexType i = 0; i < NumChunks; ++i) - { - const vec_t in0 = vec_set_32(input32[i]); - const auto col0 = - reinterpret_cast(&weights[i * OutputDimensions * 
4]); - - for (IndexType k = 0; k < NumRegs; ++k) - vec_add_dpbusd_32(acc[k], in0, col0[k]); - } - - vec_t* outptr = reinterpret_cast(output); - for (IndexType k = 0; k < NumRegs; ++k) - outptr[k] = acc[k]; - - #undef vec_set_32 - #undef vec_add_dpbusd_32 - } - else if constexpr (OutputDimensions == 1) - { - // We cannot use AVX512 for the last layer because there are only 32 inputs - // and the buffer is not padded to 64 elements. - #if defined(USE_AVX2) - using vec_t = __m256i; - #define vec_setzero() _mm256_setzero_si256() - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #define vec_hadd Simd::m256_hadd - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_setzero() _mm_setzero_si128() - #define vec_set_32 _mm_set1_epi32 - #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #define vec_hadd Simd::m128_hadd - #elif defined(USE_NEON_DOTPROD) - using vec_t = int32x4_t; - #define vec_setzero() vdupq_n_s32(0) - #define vec_set_32 vdupq_n_s32 - #define vec_add_dpbusd_32(acc, a, b) \ - Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ - vreinterpretq_s8_s32(b)) - #define vec_hadd Simd::neon_m128_hadd - #endif - - const auto inputVector = reinterpret_cast(input); - - static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType); - - static_assert(PaddedInputDimensions % InputSimdWidth == 0); - - constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; - vec_t sum0 = vec_setzero(); - const auto row0 = reinterpret_cast(&weights[0]); - - for (int j = 0; j < int(NumChunks); ++j) - { - const vec_t in = inputVector[j]; - vec_add_dpbusd_32(sum0, in, row0[j]); - } - output[0] = vec_hadd(sum0, biases[0]); - - #undef vec_setzero - #undef vec_set_32 - #undef vec_add_dpbusd_32 - #undef vec_hadd - } -#else - // Use old implementation for the other architectures. 
- affine_transform_non_ssse3( - output, weights, biases, input); -#endif - } + void propagate(const InputType* input, OutputType* output) const; - private: + protected: using BiasType = OutputType; using WeightType = std::int8_t; + static constexpr IndexType get_weight_index(IndexType i); + alignas(CacheLineSize) BiasType biases[OutputDimensions]; alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; }; } // namespace Stockfish::Eval::NNUE::Layers -#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +// This macro is used to inherit types and constexpr values from +// AffineTransform class in case implementation details define specialized +// AffineTransformSparseInput class. +#define __DEFINE_BASE_PROPERTIES \ + using Base = AffineTransform; \ + using Base::biases, Base::weights; \ +\ + public: \ + using typename Base::InputType, typename Base::OutputType, typename Base::OutputBuffer; \ + using typename Base::BiasType, typename Base::WeightType; \ + using Base::InputDimensions, Base::OutputDimensions, Base::PaddedInputDimensions, \ + Base::PaddedOutputDimensions; + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/affine_transform.h" + #include "arch/i386/nnue/layers/affine_transform_sparse_input.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/affine_transform.h" + #include "arch/arm/nnue/layers/affine_transform_sparse_input.h" + +#else + + #include "arch/generic/nnue/layers/affine_transform.h" + +#endif + +#undef __DEFINE_BASE_PROPERTIES + +#endif // NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h deleted file mode 100644 index 0ac557abac2..00000000000 --- a/src/nnue/layers/affine_transform_sparse_input.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2024 The Stockfish developers 
(see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -// Definition of layer AffineTransformSparseInput of NNUE evaluation function - -#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED -#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED - -#include -#include -#include -#include - -#include "../../bitboard.h" -#include "../nnue_common.h" -#include "affine_transform.h" -#include "simd.h" - -/* - This file contains the definition for a fully connected layer (aka affine transform) with block sparse input. 
-*/ - -namespace Stockfish::Eval::NNUE::Layers { - -#if (USE_SSSE3 | (USE_NEON >= 8)) -alignas(CacheLineSize) static inline const - std::array, 256> lookup_indices = []() { - std::array, 256> v{}; - for (unsigned i = 0; i < 256; ++i) - { - std::uint64_t j = i, k = 0; - while (j) - v[i][k++] = pop_lsb(j); - } - return v; - }(); - -// Find indices of nonzero numbers in an int32_t array -template -void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) { - #if defined(USE_SSSE3) - #if defined(USE_AVX512) - using vec_t = __m512i; - #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) - #elif defined(USE_AVX2) - using vec_t = __m256i; - #if defined(USE_VNNI) && !defined(USE_AVXVNNI) - #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256()) - #else - #define vec_nnz(a) \ - _mm256_movemask_ps( \ - _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))) - #endif - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_nnz(a) \ - _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128()))) - #endif - using vec128_t = __m128i; - #define vec128_zero _mm_setzero_si128() - #define vec128_set_16(a) _mm_set1_epi16(a) - #define vec128_load(a) _mm_load_si128(a) - #define vec128_storeu(a, b) _mm_storeu_si128(a, b) - #define vec128_add(a, b) _mm_add_epi16(a, b) - #elif defined(USE_NEON) - using vec_t = uint32x4_t; - static const std::uint32_t Mask[4] = {1, 2, 4, 8}; - #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask))) - using vec128_t = uint16x8_t; - #define vec128_zero vdupq_n_u16(0) - #define vec128_set_16(a) vdupq_n_u16(a) - #define vec128_load(a) vld1q_u16(reinterpret_cast(a)) - #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast(a), b) - #define vec128_add(a, b) vaddq_u16(a, b) - #endif - constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t); - // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we 
process in chunks of max(InputSimdWidth, 8) - constexpr IndexType ChunkSize = std::max(InputSimdWidth, 8); - constexpr IndexType NumChunks = InputDimensions / ChunkSize; - constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth; - constexpr IndexType OutputsPerChunk = ChunkSize / 8; - - const auto inputVector = reinterpret_cast(input); - IndexType count = 0; - vec128_t base = vec128_zero; - const vec128_t increment = vec128_set_16(8); - for (IndexType i = 0; i < NumChunks; ++i) - { - // bitmask of nonzero values in this chunk - unsigned nnz = 0; - for (IndexType j = 0; j < InputsPerChunk; ++j) - { - const vec_t inputChunk = inputVector[i * InputsPerChunk + j]; - nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth); - } - for (IndexType j = 0; j < OutputsPerChunk; ++j) - { - const auto lookup = (nnz >> (j * 8)) & 0xFF; - const auto offsets = - vec128_load(reinterpret_cast(&lookup_indices[lookup])); - vec128_storeu(reinterpret_cast(out + count), vec128_add(base, offsets)); - count += popcount(lookup); - base = vec128_add(base, increment); - } - } - count_out = count; -} - #undef vec_nnz - #undef vec128_zero - #undef vec128_set_16 - #undef vec128_load - #undef vec128_storeu - #undef vec128_add -#endif - -// Sparse input implementation -template -class AffineTransformSparseInput { - public: - // Input/output type - using InputType = std::uint8_t; - using OutputType = std::int32_t; - - // Number of input/output dimensions - static constexpr IndexType InputDimensions = InDims; - static constexpr IndexType OutputDimensions = OutDims; - - static_assert(OutputDimensions % 16 == 0, - "Only implemented for OutputDimensions divisible by 16."); - - static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); - static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, MaxSimdWidth); - -#if (USE_SSSE3 | (USE_NEON >= 8)) - static constexpr IndexType ChunkSize = 4; -#else - static constexpr 
IndexType ChunkSize = 1; -#endif - - using OutputBuffer = OutputType[PaddedOutputDimensions]; - - // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { - std::uint32_t hashValue = 0xCC03DAE4u; - hashValue += OutputDimensions; - hashValue ^= prevHash >> 1; - hashValue ^= prevHash << 31; - return hashValue; - } - - static constexpr IndexType get_weight_index_scrambled(IndexType i) { - return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize - + i / PaddedInputDimensions * ChunkSize + i % ChunkSize; - } - - static constexpr IndexType get_weight_index(IndexType i) { -#if (USE_SSSE3 | (USE_NEON >= 8)) - return get_weight_index_scrambled(i); -#else - return i; -#endif - } - - // Read network parameters - bool read_parameters(std::istream& stream) { - read_little_endian(stream, biases, OutputDimensions); - for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - weights[get_weight_index(i)] = read_little_endian(stream); - - return !stream.fail(); - } - - // Write network parameters - bool write_parameters(std::ostream& stream) const { - write_little_endian(stream, biases, OutputDimensions); - - for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - write_little_endian(stream, weights[get_weight_index(i)]); - - return !stream.fail(); - } - // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if (USE_SSSE3 | (USE_NEON >= 8)) - #if defined(USE_AVX512) - using invec_t = __m512i; - using outvec_t = __m512i; - #define vec_set_32 _mm512_set1_epi32 - #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 - #elif defined(USE_AVX2) - using invec_t = __m256i; - using outvec_t = __m256i; - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #elif defined(USE_SSSE3) - using invec_t = __m128i; - using outvec_t = __m128i; - #define vec_set_32 _mm_set1_epi32 - #define 
vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #elif defined(USE_NEON_DOTPROD) - using invec_t = int8x16_t; - using outvec_t = int32x4_t; - #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) - #define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32 - #elif defined(USE_NEON) - using invec_t = int8x16_t; - using outvec_t = int32x4_t; - #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) - #define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32 - #endif - static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType); - - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / ChunkSize; - constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; - std::uint16_t nnz[NumChunks]; - IndexType count; - - const auto input32 = reinterpret_cast(input); - - // Find indices of nonzero 32-bit blocks - find_nnz(input32, nnz, count); - - const outvec_t* biasvec = reinterpret_cast(biases); - outvec_t acc[NumRegs]; - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasvec[k]; - - for (IndexType j = 0; j < count; ++j) - { - const auto i = nnz[j]; - const invec_t in = vec_set_32(input32[i]); - const auto col = - reinterpret_cast(&weights[i * OutputDimensions * ChunkSize]); - for (IndexType k = 0; k < NumRegs; ++k) - vec_add_dpbusd_32(acc[k], in, col[k]); - } - - outvec_t* outptr = reinterpret_cast(output); - for (IndexType k = 0; k < NumRegs; ++k) - outptr[k] = acc[k]; - #undef vec_set_32 - #undef vec_add_dpbusd_32 -#else - // Use dense implementation for the other architectures. 
- affine_transform_non_ssse3( - output, weights, biases, input); -#endif - } - - private: - using BiasType = OutputType; - using WeightType = std::int8_t; - - alignas(CacheLineSize) BiasType biases[OutputDimensions]; - alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; -}; - -} // namespace Stockfish::Eval::NNUE::Layers - -#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 2ee378ad881..daf3b5c3909 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -16,20 +16,23 @@ along with this program. If not, see . */ -// Definition of layer ClippedReLU of NNUE evaluation function +// clipped_relu.h contains the definition of ClippedReLU layer. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// ClippedReLU::propagate #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED -#include #include #include -#include "../nnue_common.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -// Clipped ReLU template class ClippedReLU { public: @@ -40,8 +43,10 @@ class ClippedReLU { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = InputDimensions; + static_assert(InputDimensions > 0); + static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, 32); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -59,106 +64,23 @@ class ClippedReLU { bool write_parameters(std::ostream&) const { return true; } // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if defined(USE_AVX2) - if constexpr (InputDimensions % SimdWidth == 0) - { - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const __m256i 
Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m256i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - const __m256i words0 = - _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]), - _mm256_load_si256(&in[i * 4 + 1])), - WeightScaleBits); - const __m256i words1 = - _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), - WeightScaleBits); - _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32( - _mm256_packs_epi16(words0, words1), Offsets)); - } - } - else - { - constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - const __m128i words0 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - } - } - constexpr IndexType Start = InputDimensions % SimdWidth == 0 - ? 
InputDimensions / SimdWidth * SimdWidth - : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2); - -#elif defined(USE_SSE2) - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - - #ifndef USE_SSE41 - const __m128i k0x80s = _mm_set1_epi8(-128); - #endif - - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - #if defined(USE_SSE41) - const __m128i words0 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - #else - const __m128i words0 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - const __m128i packedbytes = _mm_packs_epi16(words0, words1); - _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); - #endif - } - constexpr IndexType Start = NumChunks * SimdWidth; - -#elif defined(USE_NEON) - constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); - const int8x8_t Zero = {0}; - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - int16x8_t shifted; - const auto pack = reinterpret_cast(&shifted); - pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); - pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); - out[i] = vmax_s8(vqmovn_s16(shifted), Zero); - } - constexpr IndexType Start = NumChunks * (SimdWidth / 2); -#else - constexpr IndexType Start = 0; -#endif - - for (IndexType i = Start; i < InputDimensions; ++i) - { - output[i] = 
static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127)); - } - } + void propagate(const InputType* input, OutputType* output) const; }; } // namespace Stockfish::Eval::NNUE::Layers +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/clipped_relu.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/clipped_relu.h" + +#else + + #include "arch/generic/nnue/layers/clipped_relu.h" + +#endif + #endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/layers/simd.h b/src/nnue/layers/simd.h deleted file mode 100644 index 55cb7df1421..00000000000 --- a/src/nnue/layers/simd.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#ifndef STOCKFISH_SIMD_H_INCLUDED -#define STOCKFISH_SIMD_H_INCLUDED - -#if defined(USE_AVX2) - #include - -#elif defined(USE_SSE41) - #include - -#elif defined(USE_SSSE3) - #include - -#elif defined(USE_SSE2) - #include - -#elif defined(USE_NEON) - #include -#endif - -namespace Stockfish::Simd { - -#if defined(USE_AVX512) - -[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) { - return _mm512_reduce_add_epi32(sum) + bias; -} - -[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) { - - #if defined(USE_VNNI) - acc = _mm512_dpbusd_epi32(acc, a, b); - #else - __m512i product0 = _mm512_maddubs_epi16(a, b); - product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); - acc = _mm512_add_epi32(acc, product0); - #endif -} - -#endif - -#if defined(USE_AVX2) - -[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) { - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - return _mm_cvtsi128_si32(sum128) + bias; -} - -[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) { - - #if defined(USE_VNNI) - acc = _mm256_dpbusd_epi32(acc, a, b); - #else - __m256i product0 = _mm256_maddubs_epi16(a, b); - product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); - acc = _mm256_add_epi32(acc, product0); - #endif -} - -#endif - -#if defined(USE_SSSE3) - -[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) { - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - return _mm_cvtsi128_si32(sum) + bias; -} - -[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) { - - __m128i product0 = _mm_maddubs_epi16(a, b); - product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); - 
acc = _mm_add_epi32(acc, product0); -} - -#endif - -#if defined(USE_NEON_DOTPROD) - -[[maybe_unused]] static void -dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { - - acc = vdotq_s32(acc, a, b); -} -#endif - -#if defined(USE_NEON) - -[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { - #if USE_NEON >= 8 - return vaddvq_s32(s); - #else - return s[0] + s[1] + s[2] + s[3]; - #endif -} - -[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { - return neon_m128_reduce_add_epi32(sum) + bias; -} - -#endif - -#if USE_NEON >= 8 -[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { - - int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b)); - int16x8_t product1 = vmull_high_s8(a, b); - int16x8_t sum = vpaddq_s16(product0, product1); - acc = vpadalq_s16(acc, sum); -} -#endif -} - -#endif // STOCKFISH_SIMD_H_INCLUDED diff --git a/src/nnue/layers/sqr_clipped_relu.h b/src/nnue/layers/sqr_clipped_relu.h index 9c20df9d6f5..e755be85377 100644 --- a/src/nnue/layers/sqr_clipped_relu.h +++ b/src/nnue/layers/sqr_clipped_relu.h @@ -16,20 +16,23 @@ along with this program. If not, see . */ -// Definition of layer ClippedReLU of NNUE evaluation function +// sqr_clipped_relu.h contains the definition of SqrClippedReLU layer. 
+// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// SqrClippedReLU::propagate #ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED #define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED -#include #include #include -#include "../nnue_common.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -// Clipped ReLU template class SqrClippedReLU { public: @@ -40,8 +43,10 @@ class SqrClippedReLU { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = InputDimensions; + static_assert(InputDimensions > 0); + static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, 32); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -59,45 +64,23 @@ class SqrClippedReLU { bool write_parameters(std::ostream&) const { return true; } // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if defined(USE_SSE2) - constexpr IndexType NumChunks = InputDimensions / 16; - - static_assert(WeightScaleBits == 6); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - __m128i words0 = - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])); - __m128i words1 = - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])); - - // We shift by WeightScaleBits * 2 = 12 and divide by 128 - // which is an additional shift-right of 7, meaning 19 in total. - // MulHi strips the lower 16 bits so we need to shift out 3 more to match. 
- words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3); - words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3); - - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - } - constexpr IndexType Start = NumChunks * 16; + void propagate(const InputType* input, OutputType* output) const; +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/sqr_clipped_relu.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/sqr_clipped_relu.h" #else - constexpr IndexType Start = 0; -#endif - for (IndexType i = Start; i < InputDimensions; ++i) - { - output[i] = static_cast( - // Really should be /127 but we need to make it fast so we right-shift - // by an extra 7 bits instead. Needs to be accounted for in the trainer. - std::min(127ll, ((long long) (input[i]) * input[i]) >> (2 * WeightScaleBits + 7))); - } - } -}; + #include "arch/generic/nnue/layers/sqr_clipped_relu.h" -} // namespace Stockfish::Eval::NNUE::Layers +#endif #endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index f7d2cc6ada0..6f863cc4f84 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -98,7 +98,7 @@ bool read_parameters(std::istream& stream, T& reference) { // Write evaluation function parameters template -bool write_parameters(std::ostream& stream, const T& reference) { +bool write_parameters(std::ostream& stream, T& reference) { write_little_endian(stream, T::get_hash_value()); return reference.write_parameters(stream); @@ -174,7 +174,7 @@ void Network::load(const std::string& rootDirectory, std::str template -bool Network::save(const std::optional& filename) const { +bool Network::save(const std::optional& filename) { std::string actualFilename; std::string msg; @@ -210,17 +210,14 @@ Network::evaluate(const Position& pos AccumulatorCaches::Cache* cache) const { // We manually align the arrays on the stack 
because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. - - constexpr uint64_t alignment = CacheLineSize; - #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + + CacheLineSize / sizeof(TransformedFeatureType)]; - auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); + auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType + alignas(CacheLineSize) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; #endif @@ -360,7 +357,7 @@ void Network::initialize() { template bool Network::save(std::ostream& stream, const std::string& name, - const std::string& netDescription) const { + const std::string& netDescription) { if (name.empty() || name == "None") return false; @@ -410,7 +407,7 @@ bool Network::write_header(std::ostream& stream, template bool Network::read_parameters(std::istream& stream, - std::string& netDescription) const { + std::string& netDescription) { std::uint32_t hashValue; if (!read_header(stream, &hashValue, &netDescription)) return false; @@ -429,7 +426,7 @@ bool Network::read_parameters(std::istream& stream, template bool Network::write_parameters(std::ostream& stream, - const std::string& netDescription) const { + const std::string& netDescription) { if (!write_header(stream, Network::hash, netDescription)) return false; if (!Detail::write_parameters(stream, *featureTransformer)) @@ -444,12 +441,8 @@ bool Network::write_parameters(std::ostream& stream, // Explicit template instantiation -template class Network< - NetworkArchitecture, - FeatureTransformer>; +template class Network; -template class Network< - NetworkArchitecture, - FeatureTransformer>; +template class Network; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/network.h b/src/nnue/network.h index 
152082552c9..6008b11ddfe 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -59,7 +59,7 @@ class Network { Network& operator=(Network&& other) = default; void load(const std::string& rootDirectory, std::string evalfilePath); - bool save(const std::optional& filename) const; + bool save(const std::optional& filename); NetworkOutput evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const; @@ -78,14 +78,14 @@ class Network { void initialize(); - bool save(std::ostream&, const std::string&, const std::string&) const; + bool save(std::ostream&, const std::string&, const std::string&); std::optional load(std::istream&); bool read_header(std::istream&, std::uint32_t*, std::string*) const; bool write_header(std::ostream&, std::uint32_t, const std::string&) const; - bool read_parameters(std::istream&, std::string&) const; - bool write_parameters(std::ostream&, const std::string&) const; + bool read_parameters(std::istream&, std::string&); + bool write_parameters(std::ostream&, const std::string&); // Input feature converter LargePagePtr featureTransformer; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index b8dcf1e480f..be25f3f7aa6 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -21,6 +21,7 @@ #ifndef NNUE_ACCUMULATOR_H_INCLUDED #define NNUE_ACCUMULATOR_H_INCLUDED +#include #include #include "nnue_architecture.h" @@ -28,10 +29,6 @@ namespace Stockfish::Eval::NNUE { -using BiasType = std::int16_t; -using PSQTWeightType = std::int32_t; -using IndexType = std::uint32_t; - // Class that holds the result of affine transformation of input features template struct alignas(CacheLineSize) Accumulator { @@ -56,6 +53,8 @@ struct AccumulatorCaches { template struct alignas(CacheLineSize) Cache { + using BiasType = FeatureTransformerBiasType; + using PSQTWeightType = FeatureTransformerPSQTWeightType; struct alignas(CacheLineSize) Entry { BiasType accumulation[Size]; diff --git a/src/nnue/nnue_architecture.h 
b/src/nnue/nnue_architecture.h index 7f73f87fd5e..ea443b38ed2 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -27,7 +27,6 @@ #include "features/half_ka_v2_hm.h" #include "layers/affine_transform.h" -#include "layers/affine_transform_sparse_input.h" #include "layers/clipped_relu.h" #include "layers/sqr_clipped_relu.h" #include "nnue_common.h" diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 4bc3408f18a..774be5093e2 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -16,7 +16,7 @@ along with this program. If not, see . */ -// Constants used in NNUE evaluation function +// Common constants and functions used in NNUE evaluation function #ifndef NNUE_COMMON_H_INCLUDED #define NNUE_COMMON_H_INCLUDED @@ -28,55 +28,32 @@ #include #include -#include "../misc.h" - -#if defined(USE_AVX2) - #include - -#elif defined(USE_SSE41) - #include - -#elif defined(USE_SSSE3) - #include - -#elif defined(USE_SSE2) - #include - -#elif defined(USE_NEON) - #include -#endif +#include "misc.h" namespace Stockfish::Eval::NNUE { // Version of the evaluation file constexpr std::uint32_t Version = 0x7AF32F20u; -// Constant used in evaluation value calculation -constexpr int OutputScale = 16; -constexpr int WeightScaleBits = 6; +using IndexType = std::uint32_t; +using TransformedFeatureType = std::uint8_t; + +// Types used in the feature transformer and accumulator cache entries +using FeatureTransformerBiasType = std::int16_t; +using FeatureTransformerPSQTWeightType = std::int32_t; // Size of cache line (in bytes) constexpr std::size_t CacheLineSize = 64; -constexpr const char Leb128MagicString[] = "COMPRESSED_LEB128"; -constexpr const std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1; - -// SIMD width (in bytes) -#if defined(USE_AVX2) -constexpr std::size_t SimdWidth = 32; +// Padding to dimensions of input/output buffers across layers +constexpr std::size_t DimensionPadding = 32; -#elif defined(USE_SSE2) 
-constexpr std::size_t SimdWidth = 16; - -#elif defined(USE_NEON) -constexpr std::size_t SimdWidth = 16; -#endif - -constexpr std::size_t MaxSimdWidth = 32; +// Constant used in evaluation value calculation +constexpr int OutputScale = 16; +constexpr int WeightScaleBits = 6; -// Type of input feature after conversion -using TransformedFeatureType = std::uint8_t; -using IndexType = std::uint32_t; +constexpr const char Leb128MagicString[] = "COMPRESSED_LEB128"; +constexpr std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1; // Round n up to be a multiple of base template @@ -84,6 +61,26 @@ constexpr IntType ceil_to_multiple(IntType n, IntType base) { return (n + base - 1) / base * base; } +template +constexpr int optimal_register_count() { + static_assert(RegisterSize > 0 && LaneSize > 0 && NumLanes > 0); + static_assert(RegisterSize >= LaneSize && RegisterSize % LaneSize == 0); + static_assert((NumLanes * LaneSize) % RegisterSize == 0); + + // The exact number of registers that can fit in the whole input vectors. + constexpr int Ideal = (NumLanes * LaneSize) / RegisterSize; + + if constexpr (Ideal <= NumRegisters) + return Ideal; + + // Look for the largest divisor of the ideal register count that is + // smaller than NumRegisters. + for (int divisor = NumRegisters; divisor > 1; --divisor) + if (Ideal % divisor == 0) + return divisor; + + return 1; +} // Utility to read an integer (signed or unsigned, any size) // from a stream in little-endian order. We swap the byte order after the read if diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index fa180678d89..70953368cc3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -16,7 +16,16 @@ along with this program. If not, see . 
*/ -// A class that converts the input features of the NNUE evaluation function +// nnue_feature_transformer.h contains the definition of FeatureTransformer +// class, which converts a position into NNUE input features. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// FeatureTransformer::permute_weights +// FeatureTransformer::apply_accumulator_updates_incremental +// FeatureTransformer::apply_accumulator_updates_refresh_cache +// FeatureTransformer::convert_accumulators #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED @@ -24,190 +33,23 @@ #include #include #include -#include #include #include -#include "../position.h" -#include "../types.h" -#include "nnue_accumulator.h" -#include "nnue_architecture.h" -#include "nnue_common.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_architecture.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE { -using BiasType = std::int16_t; -using WeightType = std::int16_t; -using PSQTWeightType = std::int32_t; - -// If vector instructions are enabled, we update and refresh the -// accumulator tile by tile such that each tile fits in the CPU's -// vector registers. 
-#define VECTOR - -static_assert(PSQTBuckets % 8 == 0, - "Per feature PSQT values cannot be processed at granularity lower than 8 at a time."); - -#ifdef USE_AVX512 -using vec_t = __m512i; -using psqt_vec_t = __m256i; - #define vec_load(a) _mm512_load_si512(a) - #define vec_store(a, b) _mm512_store_si512(a, b) - #define vec_add_16(a, b) _mm512_add_epi16(a, b) - #define vec_sub_16(a, b) _mm512_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b) - #define vec_zero() _mm512_setzero_epi32() - #define vec_set_16(a) _mm512_set1_epi16(a) - #define vec_max_16(a, b) _mm512_max_epi16(a, b) - #define vec_min_16(a, b) _mm512_min_epi16(a, b) - #define vec_slli_16(a, b) _mm512_slli_epi16(a, b) - // Inverse permuted at load time - #define vec_packus_16(a, b) _mm512_packus_epi16(a, b) - #define vec_load_psqt(a) _mm256_load_si256(a) - #define vec_store_psqt(a, b) _mm256_store_si256(a, b) - #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) - #define vec_zero_psqt() _mm256_setzero_si256() - #define NumRegistersSIMD 16 - #define MaxChunkSize 64 - -#elif USE_AVX2 -using vec_t = __m256i; -using psqt_vec_t = __m256i; - #define vec_load(a) _mm256_load_si256(a) - #define vec_store(a, b) _mm256_store_si256(a, b) - #define vec_add_16(a, b) _mm256_add_epi16(a, b) - #define vec_sub_16(a, b) _mm256_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b) - #define vec_zero() _mm256_setzero_si256() - #define vec_set_16(a) _mm256_set1_epi16(a) - #define vec_max_16(a, b) _mm256_max_epi16(a, b) - #define vec_min_16(a, b) _mm256_min_epi16(a, b) - #define vec_slli_16(a, b) _mm256_slli_epi16(a, b) - // Inverse permuted at load time - #define vec_packus_16(a, b) _mm256_packus_epi16(a, b) - #define vec_load_psqt(a) _mm256_load_si256(a) - #define vec_store_psqt(a, b) _mm256_store_si256(a, b) - #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) - #define 
vec_zero_psqt() _mm256_setzero_si256() - #define NumRegistersSIMD 16 - #define MaxChunkSize 32 - -#elif USE_SSE2 -using vec_t = __m128i; -using psqt_vec_t = __m128i; - #define vec_load(a) (*(a)) - #define vec_store(a, b) *(a) = (b) - #define vec_add_16(a, b) _mm_add_epi16(a, b) - #define vec_sub_16(a, b) _mm_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b) - #define vec_zero() _mm_setzero_si128() - #define vec_set_16(a) _mm_set1_epi16(a) - #define vec_max_16(a, b) _mm_max_epi16(a, b) - #define vec_min_16(a, b) _mm_min_epi16(a, b) - #define vec_slli_16(a, b) _mm_slli_epi16(a, b) - #define vec_packus_16(a, b) _mm_packus_epi16(a, b) - #define vec_load_psqt(a) (*(a)) - #define vec_store_psqt(a, b) *(a) = (b) - #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b) - #define vec_zero_psqt() _mm_setzero_si128() - #define NumRegistersSIMD (Is64Bit ? 16 : 8) - #define MaxChunkSize 16 - -#elif USE_NEON -using vec_t = int16x8_t; -using psqt_vec_t = int32x4_t; - #define vec_load(a) (*(a)) - #define vec_store(a, b) *(a) = (b) - #define vec_add_16(a, b) vaddq_s16(a, b) - #define vec_sub_16(a, b) vsubq_s16(a, b) - #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b) - #define vec_zero() \ - vec_t { 0 } - #define vec_set_16(a) vdupq_n_s16(a) - #define vec_max_16(a, b) vmaxq_s16(a, b) - #define vec_min_16(a, b) vminq_s16(a, b) - #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b)) - #define vec_packus_16(a, b) reinterpret_cast(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) - #define vec_load_psqt(a) (*(a)) - #define vec_store_psqt(a, b) *(a) = (b) - #define vec_add_psqt_32(a, b) vaddq_s32(a, b) - #define vec_sub_psqt_32(a, b) vsubq_s32(a, b) - #define vec_zero_psqt() \ - psqt_vec_t { 0 } - #define NumRegistersSIMD 16 - #define MaxChunkSize 16 - -#else - #undef VECTOR - -#endif - - -#ifdef VECTOR - - // Compute optimal SIMD register count for feature transformer accumulation. 
- - // We use __m* types as template arguments, which causes GCC to emit warnings - // about losing some attribute information. This is irrelevant to us as we - // only take their size, so the following pragma are harmless. - #if defined(__GNUC__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wignored-attributes" - #endif - -template -static constexpr int BestRegisterCount() { - #define RegisterSize sizeof(SIMDRegisterType) - #define LaneSize sizeof(LaneType) - - static_assert(RegisterSize >= LaneSize); - static_assert(MaxRegisters <= NumRegistersSIMD); - static_assert(MaxRegisters > 0); - static_assert(NumRegistersSIMD > 0); - static_assert(RegisterSize % LaneSize == 0); - static_assert((NumLanes * LaneSize) % RegisterSize == 0); - - const int ideal = (NumLanes * LaneSize) / RegisterSize; - if (ideal <= MaxRegisters) - return ideal; - - // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters - for (int divisor = MaxRegisters; divisor > 1; --divisor) - if (ideal % divisor == 0) - return divisor; - - return 1; -} - #if defined(__GNUC__) - #pragma GCC diagnostic pop - #endif -#endif - - -// Input feature converter template StateInfo::*accPtr> class FeatureTransformer { - // Number of output dimensions for one side static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; - private: -#ifdef VECTOR - static constexpr int NumRegs = - BestRegisterCount(); - static constexpr int NumPsqtRegs = - BestRegisterCount(); - - static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; - static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; - static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); - static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets"); -#endif - public: // Output type using OutputType = TransformedFeatureType; @@ -224,97 +66,28 @@ class FeatureTransformer { return 
FeatureSet::HashValue ^ (OutputDimensions * 2); } - static constexpr void order_packs([[maybe_unused]] uint64_t* v) { -#if defined(USE_AVX512) // _mm512_packs_epi16 ordering - uint64_t tmp0 = v[2], tmp1 = v[3]; - v[2] = v[8], v[3] = v[9]; - v[8] = v[4], v[9] = v[5]; - v[4] = tmp0, v[5] = tmp1; - tmp0 = v[6], tmp1 = v[7]; - v[6] = v[10], v[7] = v[11]; - v[10] = v[12], v[11] = v[13]; - v[12] = tmp0, v[13] = tmp1; -#elif defined(USE_AVX2) // _mm256_packs_epi16 ordering - std::swap(v[2], v[4]); - std::swap(v[3], v[5]); -#endif - } - - static constexpr void inverse_order_packs([[maybe_unused]] uint64_t* v) { -#if defined(USE_AVX512) // Inverse _mm512_packs_epi16 ordering - uint64_t tmp0 = v[2], tmp1 = v[3]; - v[2] = v[4], v[3] = v[5]; - v[4] = v[8], v[5] = v[9]; - v[8] = tmp0, v[9] = tmp1; - tmp0 = v[6], tmp1 = v[7]; - v[6] = v[12], v[7] = v[13]; - v[12] = v[10], v[13] = v[11]; - v[10] = tmp0, v[11] = tmp1; -#elif defined(USE_AVX2) // Inverse _mm256_packs_epi16 ordering - std::swap(v[2], v[4]); - std::swap(v[3], v[5]); -#endif - } - - void permute_weights([[maybe_unused]] void (*order_fn)(uint64_t*)) const { -#if defined(USE_AVX2) - #if defined(USE_AVX512) - constexpr IndexType di = 16; - #else - constexpr IndexType di = 8; - #endif - uint64_t* b = reinterpret_cast(const_cast(&biases[0])); - for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / sizeof(uint64_t); i += di) - order_fn(&b[i]); - - for (IndexType j = 0; j < InputDimensions; ++j) - { - uint64_t* w = - reinterpret_cast(const_cast(&weights[j * HalfDimensions])); - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(uint64_t); - i += di) - order_fn(&w[i]); - } -#endif - } - - inline void scale_weights(bool read) const { - for (IndexType j = 0; j < InputDimensions; ++j) - { - WeightType* w = const_cast(&weights[j * HalfDimensions]); - for (IndexType i = 0; i < HalfDimensions; ++i) - w[i] = read ? 
w[i] * 2 : w[i] / 2; - } - - BiasType* b = const_cast(biases); - for (IndexType i = 0; i < HalfDimensions; ++i) - b[i] = read ? b[i] * 2 : b[i] / 2; - } - // Read network parameters bool read_parameters(std::istream& stream) { - read_leb_128(stream, biases, HalfDimensions); read_leb_128(stream, weights, HalfDimensions * InputDimensions); read_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); - permute_weights(inverse_order_packs); - scale_weights(true); + permute_weights(); + scale_weights(); return !stream.fail(); } // Write network parameters - bool write_parameters(std::ostream& stream) const { - - permute_weights(order_packs); - scale_weights(false); + bool write_parameters(std::ostream& stream) { + permute_weights(); + scale_weights(); write_leb_128(stream, biases, HalfDimensions); write_leb_128(stream, weights, HalfDimensions * InputDimensions); write_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); - permute_weights(inverse_order_packs); - scale_weights(true); + permute_weights(); + scale_weights(); return !stream.fail(); } @@ -323,135 +96,51 @@ class FeatureTransformer { AccumulatorCaches::Cache* cache, OutputType* output, int bucket) const { + update_accumulator(pos, cache); update_accumulator(pos, cache); + convert_accumulators(pos, output); - const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation; - const auto psqt = - (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]) - / 2; - - const auto& accumulation = (pos.state()->*accPtr).accumulation; - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = (HalfDimensions / 2) * p; - -#if defined(VECTOR) - - constexpr IndexType OutputChunkSize = MaxChunkSize; - static_assert((HalfDimensions / 2) % OutputChunkSize == 0); - constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - - const vec_t Zero = vec_zero(); - const vec_t One = 
vec_set_16(127 * 2); - - const vec_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); - const vec_t* in1 = - reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); - vec_t* out = reinterpret_cast(output + offset); - - // Per the NNUE architecture, here we want to multiply pairs of - // clipped elements and divide the product by 128. To do this, - // we can naively perform min/max operation to clip each of the - // four int16 vectors, mullo pairs together, then pack them into - // one int8 vector. However, there exists a faster way. - - // The idea here is to use the implicit clipping from packus to - // save us two vec_max_16 instructions. This clipping works due - // to the fact that any int16 integer below zero will be zeroed - // on packus. - - // Consider the case where the second element is negative. - // If we do standard clipping, that element will be zero, which - // means our pairwise product is zero. If we perform packus and - // remove the lower-side clip for the second element, then our - // product before packus will be negative, and is zeroed on pack. - // The two operation produce equivalent results, but the second - // one (using packus) saves one max operation per pair. - - // But here we run into a problem: mullo does not preserve the - // sign of the multiplication. We can get around this by doing - // mulhi, which keeps the sign. But that requires an additional - // tweak. - - // mulhi cuts off the last 16 bits of the resulting product, - // which is the same as performing a rightward shift of 16 bits. - // We can use this to our advantage. Recall that we want to - // divide the final product by 128, which is equivalent to a - // 7-bit right shift. Intuitively, if we shift the clipped - // value left by 9, and perform mulhi, which shifts the product - // right by 16 bits, then we will net a right shift of 7 bits. - // However, this won't work as intended. 
Since we clip the - // values to have a maximum value of 127, shifting it by 9 bits - // might occupy the signed bit, resulting in some positive - // values being interpreted as negative after the shift. - - // There is a way, however, to get around this limitation. When - // loading the network, scale accumulator weights and biases by - // 2. To get the same pairwise multiplication result as before, - // we need to divide the product by 128 * 2 * 2 = 512, which - // amounts to a right shift of 9 bits. So now we only have to - // shift left by 7 bits, perform mulhi (shifts right by 16 bits) - // and net a 9 bit right shift. Since we scaled everything by - // two, the values are clipped at 127 * 2 = 254, which occupies - // 8 bits. Shifting it by 7 bits left will no longer occupy the - // signed bit, so we are safe. - - // Note that on NEON processors, we shift left by 6 instead - // because the instruction "vqdmulhq_s16" also doubles the - // return value after the multiplication, adding an extra shift - // to the left by 1, so we compensate by shifting less before - // the multiplication. 
- - constexpr int shift = - #if defined(USE_SSE2) - 7; - #else - 6; - #endif - - for (IndexType j = 0; j < NumOutputChunks; ++j) - { - const vec_t sum0a = - vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift); - const vec_t sum0b = - vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift); - const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One); - const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One); + return (psqtAccumulation[pos.side_to_move()][bucket] + - psqtAccumulation[~pos.side_to_move()][bucket]) + / 2; + } - const vec_t pa = vec_mulhi_16(sum0a, sum1a); - const vec_t pb = vec_mulhi_16(sum0b, sum1b); + void hint_common_access(const Position& pos, + AccumulatorCaches::Cache* cache) const { + hint_common_access_for_perspective(pos, cache); + hint_common_access_for_perspective(pos, cache); + } - out[j] = vec_packus_16(pa, pb); - } + private: + using BiasType = FeatureTransformerBiasType; + using WeightType = std::int16_t; + using PSQTWeightType = FeatureTransformerPSQTWeightType; -#else + // Stores constants and types based on the target architecture. + struct Details; - for (IndexType j = 0; j < HalfDimensions / 2; ++j) - { - BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; - BiasType sum1 = - accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; - sum0 = std::clamp(sum0, 0, 127 * 2); - sum1 = std::clamp(sum1, 0, 127 * 2); - output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); - } + template + inline void permute_weights(); -#endif + template + inline void scale_weights() { + for (IndexType j = 0; j < InputDimensions; ++j) + { + WeightType* w = &weights[j * HalfDimensions]; + for (IndexType i = 0; i < HalfDimensions; ++i) + w[i] = Write ? 
w[i] / 2 : w[i] * 2; } - return psqt; - } // end of function transform() - - void hint_common_access(const Position& pos, - AccumulatorCaches::Cache* cache) const { - hint_common_access_for_perspective(pos, cache); - hint_common_access_for_perspective(pos, cache); + for (IndexType i = 0; i < HalfDimensions; ++i) + biases[i] = Write ? biases[i] / 2 : biases[i] * 2; } - private: + // Look for an accumulator of an earlier position. It traverses the linked + // list of states starting from the current position and goes back until it + // finds a computed accumulator or a state that requires a full refresh. template StateInfo* try_find_computed_accumulator(const Position& pos) const { // Look for a usable accumulator of an earlier position. We keep track @@ -477,13 +166,6 @@ class FeatureTransformer { assert((computed->*accPtr).computed[Perspective]); assert(computed->next != nullptr); -#ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch. - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; -#endif - const Square ksq = pos.square(Perspective); // The size must be enough to contain the largest possible update. @@ -503,162 +185,8 @@ class FeatureTransformer { StateInfo* next = CurrentOnly ? 
pos.state() : computed->next; assert(!(next->*accPtr).computed[Perspective]); -#ifdef VECTOR - if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) - { - auto accIn = - reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); - auto accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); - - const IndexType offsetR0 = HalfDimensions * removed[0]; - auto columnR0 = reinterpret_cast(&weights[offsetR0]); - const IndexType offsetA = HalfDimensions * added[0]; - auto columnA = reinterpret_cast(&weights[offsetA]); - - if (removed.size() == 1) - { - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA[i]); - } - else - { - const IndexType offsetR1 = HalfDimensions * removed[1]; - auto columnR1 = reinterpret_cast(&weights[offsetR1]); - - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]), - vec_add_16(columnR0[i], columnR1[i])); - } - - auto accPsqtIn = reinterpret_cast( - &(computed->*accPtr).psqtAccumulation[Perspective][0]); - auto accPsqtOut = - reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); - - const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; - auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); - const IndexType offsetPsqtA = PSQTBuckets * added[0]; - auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); - - if (removed.size() == 1) - { - for (std::size_t i = 0; - i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) - accPsqtOut[i] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), - columnPsqtA[i]); - } - else - { - const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; - auto columnPsqtR1 = reinterpret_cast(&psqtWeights[offsetPsqtR1]); - - for (std::size_t i = 0; - i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) - accPsqtOut[i] 
= - vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA[i]), - vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])); - } - } - else - { - for (IndexType i = 0; i < HalfDimensions / TileHeight; ++i) - { - // Load accumulator - auto accTileIn = reinterpret_cast( - &(computed->*accPtr).accumulation[Perspective][i * TileHeight]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_load(&accTileIn[j]); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index + i * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_sub_16(acc[j], column[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index + i * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_add_16(acc[j], column[j]); - } - - // Store accumulator - auto accTileOut = reinterpret_cast( - &(next->*accPtr).accumulation[Perspective][i * TileHeight]); - for (IndexType j = 0; j < NumRegs; ++j) - vec_store(&accTileOut[j], acc[j]); - } - - for (IndexType i = 0; i < PSQTBuckets / PsqtTileHeight; ++i) - { - // Load accumulator - auto accTilePsqtIn = reinterpret_cast( - &(computed->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_load_psqt(&accTilePsqtIn[j]); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = PSQTBuckets * 
index + i * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]); - } - - // Store accumulator - auto accTilePsqtOut = reinterpret_cast( - &(next->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - vec_store_psqt(&accTilePsqtOut[j], psqt[j]); - } - } -#else - std::memcpy((next->*accPtr).accumulation[Perspective], - (computed->*accPtr).accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); - std::memcpy((next->*accPtr).psqtAccumulation[Perspective], - (computed->*accPtr).psqtAccumulation[Perspective], - PSQTBuckets * sizeof(PSQTWeightType)); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] -= weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] -= - psqtWeights[index * PSQTBuckets + i]; - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] += weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] += - psqtWeights[index * PSQTBuckets + i]; - } -#endif + apply_accumulator_updates_incremental(computed, next, removed, + added); (next->*accPtr).computed[Perspective] = true; @@ -666,15 +194,26 @@ class FeatureTransformer { update_accumulator_incremental(pos, next); } + template + inline void apply_accumulator_updates_incremental(StateInfo* computed_st, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const; + + // Update the 
accumluator for the current position and refresh the cache. + // + // Instead of rebuilding the accumulator from scratch, the accumulator is + // updated by applying the differences between it and the cached one. template void update_accumulator_refresh_cache(const Position& pos, AccumulatorCaches::Cache* cache) const { assert(cache != nullptr); - Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq][Perspective]; FeatureSet::IndexList removed, added; + const Square ksq = pos.square(Perspective); + auto& entry = (*cache)[ksq][Perspective]; + for (Color c : {WHITE, BLACK}) { for (PieceType pt = PAWN; pt <= KING; ++pt) @@ -698,133 +237,24 @@ class FeatureTransformer { } } - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = true; - -#ifdef VECTOR - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; - - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto accTile = - reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); - auto entryTile = reinterpret_cast(&entry.accumulation[j * TileHeight]); - - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = entryTile[k]; - - int i = 0; - for (; i < int(std::min(removed.size(), added.size())); ++i) - { - IndexType indexR = removed[i]; - const IndexType offsetR = HalfDimensions * indexR + j * TileHeight; - auto columnR = reinterpret_cast(&weights[offsetR]); - IndexType indexA = added[i]; - const IndexType offsetA = HalfDimensions * indexA + j * TileHeight; - auto columnA = reinterpret_cast(&weights[offsetA]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); - } - for (; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } - for (; i < int(added.size()); ++i) - { - 
IndexType index = added[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&entryTile[k], acc[k]); - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&accTile[k], acc[k]); - } - - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - auto accTilePsqt = reinterpret_cast( - &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - auto entryTilePsqt = - reinterpret_cast(&entry.psqtAccumulation[j * PsqtTileHeight]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = entryTilePsqt[k]; - - for (int i = 0; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); - } - for (int i = 0; i < int(added.size()); ++i) - { - IndexType index = added[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&entryTilePsqt[k], psqt[k]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } - -#else - - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[j] -= weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k]; - } - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 
0; j < HalfDimensions; ++j) - entry.accumulation[j] += weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k]; - } - - // The accumulator of the refresh entry has been updated. - // Now copy its content to the actual accumulator we were refreshing. - - std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, - sizeof(BiasType) * HalfDimensions); - - std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, - sizeof(int32_t) * PSQTBuckets); -#endif - for (Color c : {WHITE, BLACK}) entry.byColorBB[c] = pos.pieces(c); for (PieceType pt = PAWN; pt <= KING; ++pt) entry.byTypeBB[pt] = pos.pieces(pt); + + auto& accumulator = pos.state()->*accPtr; + apply_accumulator_updates_refresh_cache(accumulator, entry, removed, added); + accumulator.computed[Perspective] = true; } + template + inline void apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const; + template void hint_common_access_for_perspective(const Position& pos, AccumulatorCaches::Cache* cache) const { @@ -860,6 +290,9 @@ class FeatureTransformer { update_accumulator_refresh_cache(pos, cache); } + // Called in transform after both accumulators are updated. 
+ inline void convert_accumulators(const Position& pos, OutputType* output) const; + template friend struct AccumulatorCaches::Cache; @@ -870,4 +303,18 @@ class FeatureTransformer { } // namespace Stockfish::Eval::NNUE -#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/nnue_feature_transformer.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/nnue_feature_transformer.h" + +#else + + #include "arch/generic/nnue/nnue_feature_transformer.h" + +#endif + +#endif // NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/position.cpp b/src/position.cpp index f596b015355..760c2d3ccb5 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) { for (Bitboard b = pos.checkers(); b;) os << UCIEngine::square(pop_lsb(b)) << " "; - if (int(Tablebases::MaxCardinality) >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + if (Tablebases::MaxCardinality >= int(popcount(pos.pieces())) && !pos.can_castle(ANY_CASTLING)) { StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); @@ -1328,7 +1328,7 @@ bool Position::pos_is_ok() const { for (Piece pc : Pieces) - if (pieceCount[pc] != popcount(pieces(color_of(pc), type_of(pc))) + if (pieceCount[pc] != int(popcount(pieces(color_of(pc), type_of(pc)))) || pieceCount[pc] != std::count(board, board + SQUARE_NB, pc)) assert(0 && "pos_is_ok: Pieces"); diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp index 9b24e700b18..e23235db5ff 100644 --- a/src/syzygy/tbprobe.cpp +++ b/src/syzygy/tbprobe.cpp @@ -709,7 +709,7 @@ int map_score(TBTable* entry, File f, int value, WDLScore wdl) { } // A temporary fix for the compiler bug with AVX-512. 
(#4450) -#ifdef USE_AVX512 +#ifdef __AVX512F__ #if defined(__clang__) && defined(__clang_major__) && __clang_major__ >= 15 #define CLANG_AVX512_BUG_FIX __attribute__((optnone)) #endif @@ -1729,7 +1729,7 @@ Config Tablebases::rank_root_moves(const OptionsMap& options, config.probeDepth = 0; } - if (config.cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + if (config.cardinality >= int(popcount(pos.pieces())) && !pos.can_castle(ANY_CASTLING)) { // Rank moves using DTZ tables config.rootInTB = root_probe(pos, rootMoves, options["Syzygy50MoveRule"], rankDTZ); diff --git a/src/types.h b/src/types.h index b12491d6cdd..5afc6f0c026 100644 --- a/src/types.h +++ b/src/types.h @@ -19,89 +19,11 @@ #ifndef TYPES_H_INCLUDED #define TYPES_H_INCLUDED -// When compiling with provided Makefile (e.g. for Linux and OSX), configuration -// is done automatically. To get started type 'make help'. -// -// When Makefile is not used (e.g. with Microsoft Visual Studio) some switches -// need to be set manually: -// -// -DNDEBUG | Disable debugging mode. Always use this for release. -// -// -DNO_PREFETCH | Disable use of prefetch asm-instruction. You may need this to -// | run on some very old machines. -// -// -DUSE_POPCNT | Add runtime support for use of popcnt asm-instruction. Works -// | only in 64-bit mode and requires hardware with popcnt support. -// -// -DUSE_PEXT | Add runtime support for use of pext asm-instruction. Works -// | only in 64-bit mode and requires hardware with pext support. 
- #include #include - #if defined(_MSC_VER) - // Disable some silly and noisy warnings from MSVC compiler - #pragma warning(disable: 4127) // Conditional expression is constant - #pragma warning(disable: 4146) // Unary minus operator applied to unsigned type - #pragma warning(disable: 4800) // Forcing value to bool 'true' or 'false' - #endif - -// Predefined macros hell: -// -// __GNUC__ Compiler is GCC, Clang or ICX -// __clang__ Compiler is Clang or ICX -// __INTEL_LLVM_COMPILER Compiler is ICX -// _MSC_VER Compiler is MSVC -// _WIN32 Building on Windows (any) -// _WIN64 Building on Windows 64 bit - - #if defined(__GNUC__) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) \ - && defined(_WIN32) && !defined(__clang__) - #define ALIGNAS_ON_STACK_VARIABLES_BROKEN - #endif - - #define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast(ptr) % alignment == 0) - - #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used - #include // Microsoft header for _BitScanForward64() - #define IS_64BIT - #endif - - #if defined(USE_POPCNT) && defined(_MSC_VER) - #include // Microsoft header for _mm_popcnt_u64() - #endif - - #if !defined(NO_PREFETCH) && defined(_MSC_VER) - #include // Microsoft header for _mm_prefetch() - #endif - - #if defined(USE_PEXT) - #include // Header for _pext_u64() intrinsic - #define pext(b, m) _pext_u64(b, m) - #else - #define pext(b, m) 0 - #endif - namespace Stockfish { - #ifdef USE_POPCNT -constexpr bool HasPopCnt = true; - #else -constexpr bool HasPopCnt = false; - #endif - - #ifdef USE_PEXT -constexpr bool HasPext = true; - #else -constexpr bool HasPext = false; - #endif - - #ifdef IS_64BIT -constexpr bool Is64Bit = true; - #else -constexpr bool Is64Bit = false; - #endif - using Key = uint64_t; using Bitboard = uint64_t;