diff --git a/scripts/get_native_properties.sh b/scripts/get_native_properties.sh deleted file mode 100755 index fb124021a31..00000000000 --- a/scripts/get_native_properties.sh +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/sh - -# -# Returns properties of the native system. -# best architecture as supported by the CPU -# filename of the best binary uploaded as an artifact during CI -# - -# Check if all the given flags are present in the CPU flags list -check_flags() { - for flag; do - printf '%s\n' "$flags" | grep -q -w "$flag" || return 1 - done -} - -# Set the CPU flags list -# remove underscores and points from flags, e.g. gcc uses avx512vnni, while some cpuinfo can have avx512_vnni, some systems use sse4_1 others sse4.1 -get_flags() { - flags=$(awk '/^flags[ \t]*:|^Features[ \t]*:/{gsub(/^flags[ \t]*:[ \t]*|^Features[ \t]*:[ \t]*|[_.]/, ""); line=$0} END{print line}' /proc/cpuinfo) -} - -# Check for gcc march "znver1" or "znver2" https://en.wikichip.org/wiki/amd/cpuid -check_znver_1_2() { - vendor_id=$(awk '/^vendor_id/{print $3; exit}' /proc/cpuinfo) - cpu_family=$(awk '/^cpu family/{print $4; exit}' /proc/cpuinfo) - [ "$vendor_id" = "AuthenticAMD" ] && [ "$cpu_family" = "23" ] && znver_1_2=true -} - -# Set the file CPU x86_64 architecture -set_arch_x86_64() { - if check_flags 'avx512vnni' 'avx512dq' 'avx512f' 'avx512bw' 'avx512vl'; then - true_arch='x86-64-vnni256' - elif check_flags 'avx512f' 'avx512bw'; then - true_arch='x86-64-avx512' - elif [ -z "${znver_1_2+1}" ] && check_flags 'bmi2'; then - true_arch='x86-64-bmi2' - elif check_flags 'avx2'; then - true_arch='x86-64-avx2' - elif check_flags 'sse41' && check_flags 'popcnt'; then - true_arch='x86-64-sse41-popcnt' - else - true_arch='x86-64' - fi -} - -# Check the system type -uname_s=$(uname -s) -uname_m=$(uname -m) -case $uname_s in - 'Darwin') # Mac OSX system - case $uname_m in - 'arm64') - true_arch='apple-silicon' - file_arch='x86-64-sse41-popcnt' # Supported by Rosetta 2 - ;; - 'x86_64') - flags=$(sysctl 
-n machdep.cpu.features machdep.cpu.leaf7_features | tr '\n' ' ' | tr '[:upper:]' '[:lower:]' | tr -d '_.') - set_arch_x86_64 - if [ "$true_arch" = 'x86-64-vnni256' ] || [ "$true_arch" = 'x86-64-avx512' ]; then - file_arch='x86-64-bmi2' - fi - ;; - esac - file_os='macos' - file_ext='tar' - ;; - 'Linux') # Linux system - get_flags - case $uname_m in - 'x86_64') - file_os='ubuntu' - check_znver_1_2 - set_arch_x86_64 - ;; - 'i686') - file_os='ubuntu' - true_arch='x86-32' - ;; - 'aarch64') - file_os='android' - true_arch='armv8' - if check_flags 'asimddp'; then - true_arch="$true_arch-dotprod" - fi - ;; - 'armv7'*) - file_os='android' - true_arch='armv7' - if check_flags 'neon'; then - true_arch="$true_arch-neon" - fi - ;; - *) # Unsupported machine type, exit with error - printf 'Unsupported machine type: %s\n' "$uname_m" - exit 1 - ;; - esac - file_ext='tar' - ;; - 'CYGWIN'*|'MINGW'*|'MSYS'*) # Windows system with POSIX compatibility layer - get_flags - check_znver_1_2 - set_arch_x86_64 - file_os='windows' - file_ext='zip' - ;; - *) - # Unknown system type, exit with error - printf 'Unsupported system type: %s\n' "$uname_s" - exit 1 - ;; -esac - -if [ -z "$file_arch" ]; then - file_arch=$true_arch -fi - -file_name="stockfish-$file_os-$file_arch.$file_ext" - -printf '%s %s\n' "$true_arch" "$file_name" diff --git a/src/Makefile b/src/Makefile index 042d9479cc8..87e93cb581b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,1035 +14,400 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +default: help -### ========================================================================== -### Section 1. 
General Configuration -### ========================================================================== +.PHONY: default help strip install clean objclean profileclean net format \ + all config-sanity analyze build profile-build \ + gcc-profile-make gcc-profile-use \ + clang-profile-make clang-profile-use \ + icx-profile-make icx-profile-use -### Establish the operating system name -KERNEL := $(shell uname -s) -ifeq ($(KERNEL),Linux) - OS := $(shell uname -o) -endif +VPATH = syzygy:nnue:nnue/features -### Target Windows OS -ifeq ($(OS),Windows_NT) - ifneq ($(COMP),ndk) - target_windows = yes - endif -else ifeq ($(COMP),mingw) - target_windows = yes - ifeq ($(WINE_PATH),) - WINE_PATH := $(shell which wine) - endif -endif +SRCS := $(shell find . -name "*.cpp" ! -path "./incbin/*") +OBJS := $(notdir $(SRCS:.cpp=.o)) +HEADERS := $(shell find . -name "*.h" ! -path "./incbin/*") + +INSTALL_PREFIX := /usr/local +INSTALL_PATH := $(INSTALL_PREFIX)/bin -### Executable name -ifeq ($(target_windows),yes) - EXE = stockfish.exe +ifeq ($(OS),Windows_NT) + INSTALL_EXE := stockfish.exe else - EXE = stockfish + INSTALL_EXE := stockfish endif -### Installation dir definitions -PREFIX = /usr/local -BINDIR = $(PREFIX)/bin - -### Built-in benchmark for pgo-builds -PGOBENCH = $(WINE_PATH) ./$(EXE) bench +KERNEL := $(shell uname -s) -### Source and object files -SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \ - misc.cpp movegen.cpp movepick.cpp position.cpp \ - search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ - nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp engine.cpp score.cpp memory.cpp +strip: + -@test -f stockfish && strip stockfish + -@test -f stockfish.exe && strip stockfish.exe -HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \ - nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \ - nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h 
nnue/layers/simd.h \ - nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \ - nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \ - search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \ - tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h engine.h score.h numa.h memory.h +install: + mkdir -p -m 755 $(INSTALL_PATH) + cp $(INSTALL_EXE) $(INSTALL_PATH) + strip $(INSTALL_PATH)/$(INSTALL_EXE) -OBJS = $(notdir $(SRCS:.cpp=.o)) +clean: objclean profileclean + @rm -f .depend -VPATH = syzygy:nnue:nnue/features +objclean: + @rm -f stockfish stockfish.exe $(OBJS) *.o.tmp -### ========================================================================== -### Section 2. High-level Configuration -### ========================================================================== -# -# flag --- Comp switch --- Description -# ---------------------------------------------------------------------------- -# -# debug = yes/no --- -DNDEBUG --- Enable/Disable debug mode -# sanitize = none/ ... (-fsanitize ) -# --- ( undefined ) --- enable undefined behavior checks -# --- ( thread ) --- enable threading error checks -# --- ( address ) --- enable memory access checks -# --- ...etc... --- see compiler documentation for supported sanitizers -# optimize = yes/no --- (-O3/-fast etc.) 
--- Enable/Disable optimizations -# arch = (name) --- (-arch) --- Target architecture -# bits = 64/32 --- -DIS_64BIT --- 64-/32-bit operating system -# prefetch = yes/no --- -DUSE_PREFETCH --- Use prefetch asm-instruction -# popcnt = yes/no --- -DUSE_POPCNT --- Use popcnt asm-instruction -# pext = yes/no --- -DUSE_PEXT --- Use pext x86_64 asm-instruction -# sse = yes/no --- -msse --- Use Intel Streaming SIMD Extensions -# mmx = yes/no --- -mmmx --- Use Intel MMX instructions -# sse2 = yes/no --- -msse2 --- Use Intel Streaming SIMD Extensions 2 -# ssse3 = yes/no --- -mssse3 --- Use Intel Supplemental Streaming SIMD Extensions 3 -# sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 -# avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 -# avxvnni = yes/no --- -mavxvnni --- Use Intel Vector Neural Network Instructions AVX -# avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 -# vnni256 = yes/no --- -mavx256vnni --- Use Intel Vector Neural Network Instructions 512 with 256bit operands -# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 -# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture -# dotprod = yes/no --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions -# -# Note that Makefile is space sensitive, so when adding new architectures -# or modifying existing flags, you have to make sure there are no extra spaces -# at the end of the line for flag values. -# -# Example of use for these flags: -# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined" +profileclean: + @rm -f PGOBENCH.out + @rm -rf profdir + @rm -f stockfish.profdata *.profraw +net: + @$(SHELL) ../scripts/net.sh -### 2.1. General and architecture defaults +format: CLANG_FORMAT := $(shell command -v clang-format-18 2> /dev/null || \ + command -v clang-format 2> /dev/null) +format: + @test -n "$(CLANG_FORMAT)" || ( \ + echo "clang-format not found. 
Please install clang-format-18."; false \ + ) + @$(CLANG_FORMAT) -i $(SRCS) $(HEADERS) -style=file -ifeq ($(ARCH),) - ARCH = native -endif +### ========================================================================== -ifeq ($(ARCH), native) - override ARCH := $(shell $(SHELL) ../scripts/get_native_properties.sh | cut -d " " -f 1) -endif +CXX_REQUIRED_RULES := analyze config-sanity build profile-build all \ + gcc-profile-make gcc-profile-use \ + clang-profile-make clang-profile-use \ + icx-profile-make icx-profile-use -# explicitly check for the list of supported architectures (as listed with make help), -# the user can override with `make ARCH=x86-32-vnni256 SUPPORTED_ARCH=true` -ifeq ($(ARCH), $(filter $(ARCH), \ - x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \ - x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \ - x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \ - armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64 loongarch64)) - SUPPORTED_ARCH=true -else - SUPPORTED_ARCH=false -endif +ifeq ($(MAKELEVEL),0) +ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) optimize = yes debug = no sanitize = none -bits = 64 -prefetch = no -popcnt = no -pext = no -sse = no -mmx = no -sse2 = no -ssse3 = no -sse41 = no -avx2 = no -avxvnni = no -avx512 = no -vnni256 = no -vnni512 = no -neon = no -dotprod = no -arm_version = 0 -STRIP = strip - -ifneq ($(shell which clang-format-18 2> /dev/null),) - CLANG-FORMAT = clang-format-18 -else - CLANG-FORMAT = clang-format -endif - -### 2.2 Architecture specific - -ifeq ($(findstring x86,$(ARCH)),x86) -# x86-32/64 - -ifeq ($(findstring x86-32,$(ARCH)),x86-32) - arch = i386 - bits = 32 - sse = no - mmx = yes +ifeq ($(shell command -v $(CXX) 2> /dev/null),) + $(error Compiler $(CXX) not found) +endif + +define test-compiler-macro +$(shell echo | $(CXX) -dM -x c++ -E - | \ + grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" > 
/dev/null 2>&1 && echo 1) +endef + +define get-compiler-macro +$(shell echo | $(CXX) -dM -x c++ -E - | \ + grep -E "^#define[[:space:]]+$(1)$|([[:space:]]+.*)" | \ + sed "s/^#define[[:space:]]\+$(1)[[:space:]]\+//") +endef + +### 1. Detect compiler type + +ifeq ($(call test-compiler-macro,__GNUC__),1) + ifeq ($(call test-compiler-macro,__INTEL_LLVM_COMPILER),1) + $(info Using Intel oneAPI DPC++/C++ Compiler) $(info ) + COMP := icx + profile_make = icx-profile-make + profile_use = icx-profile-use + else ifeq ($(call test-compiler-macro,__clang__),1) + $(info Using LLVM C/C++ Compiler (Clang)) $(info ) + COMP := clang + CLANG_VERSION := $(call get-compiler-macro,__clang_major__) + LLVM_PROFDATA := $(shell command -v llvm-profdata-$(CLANG_VERSION) 2> /dev/null || \ + command -v llvm-profdata 2> /dev/null) + profile_make = clang-profile-make + profile_use = clang-profile-use + export LLVM_PROFDATA + else + $(info Using GNU C/C++ Compiler) $(info ) + COMP := gcc + GCC_VERSION := $(call get-compiler-macro,__GNUC__) + profile_make = gcc-profile-make + profile_use = gcc-profile-use + endif +endif + +ifneq ($(filter $(COMP),gcc clang),) + MINGW := $(call test-compiler-macro,__MINGW32__) +endif + +ifeq ($(MINGW),1) + EXE = stockfish.exe else - arch = x86_64 - sse = yes - sse2 = yes -endif - -ifeq ($(findstring -sse,$(ARCH)),-sse) - sse = yes -endif - -ifeq ($(findstring -popcnt,$(ARCH)),-popcnt) - popcnt = yes + EXE = stockfish endif -ifeq ($(findstring -mmx,$(ARCH)),-mmx) - mmx = yes -endif - -ifeq ($(findstring -sse2,$(ARCH)),-sse2) - sse = yes - sse2 = yes -endif +export COMP MINGW EXE -ifeq ($(findstring -ssse3,$(ARCH)),-ssse3) - sse = yes - sse2 = yes - ssse3 = yes -endif - -ifeq ($(findstring -sse41,$(ARCH)),-sse41) - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes -endif +### 2. Set compiler options -ifeq ($(findstring -modern,$(ARCH)),-modern) - $(warning *** ARCH=$(ARCH) is deprecated, defaulting to ARCH=x86-64-sse41-popcnt. 
Execute `make help` for a list of available architectures. ***) - $(shell sleep 5) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes -endif - -ifeq ($(findstring -avx2,$(ARCH)),-avx2) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes -endif +# GNU C Compiler +# https://gcc.gnu.org/onlinedocs/gcc/Option-Index.html +# +# Clang Compiler +# https://clang.llvm.org/docs/ClangCommandLineReference.html +# https://clang.llvm.org/docs/DiagnosticsReference.html +# +# Intel oneAPI DPC++/C++ Compiler +# https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/developer-guide-reference/2024-2/alphabetical-option-list.html -ifeq ($(findstring -avxvnni,$(ARCH)),-avxvnni) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - avxvnni = yes - pext = yes -endif +### 2.1. Common options -ifeq ($(findstring -bmi2,$(ARCH)),-bmi2) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes -endif +SF_CXXFLAGS := -std=c++17 -I. -Wall -DUSE_PTHREADS +SF_LDFLAGS := -ifeq ($(findstring -avx512,$(ARCH)),-avx512) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - avx512 = yes -endif +SF_LIBS := pthread -ifeq ($(findstring -vnni256,$(ARCH)),-vnni256) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - vnni256 = yes -endif +### 2.2. 
Compiler-specific options -ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) - popcnt = yes - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - avx2 = yes - pext = yes - avx512 = yes - vnni512 = yes +ifeq ($(COMP),gcc) + SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wmissing-declarations \ + -Wshadow +else ifeq ($(COMP),clang) + SF_CXXFLAGS += -pedantic -Wextra -Wcast-qual -Wconditional-uninitialized \ + -Wmissing-prototypes -Wshadow +else ifeq ($(COMP),icx) + SF_CXXFLAGS += -Wabi -Wmissing-declarations -Wmissing-prototypes -Wshadow endif -ifeq ($(sse),yes) - prefetch = yes -endif +### 2.3. Optimization options -# 64-bit pext is not available on x86-32 -ifeq ($(bits),32) - pext = no -endif +ifeq ($(optimize),yes) + SF_CXXFLAGS += -O3 + + ifeq ($(COMP),gcc) + SF_CXXFLAGS += -funroll-loops + ifeq ($(shell expr $(GCC_VERSION) \< 12),1) + SF_CXXFLAGS += -flto + SF_LDFLAGS += -flto + else + SF_CXXFLAGS += -flto=jobserver + SF_LDFLAGS += -flto=jobserver + endif + SF_CXXFLAGS += -flto-partition=one + SF_LDFLAGS += -flto-partition=one + else ifeq ($(COMP),clang) + SF_CXXFLAGS += -funroll-loops -flto=full + SF_LDFLAGS += -flto=full + ifeq ($(shell expr $(CLANG_VERSION) \< 16),1) + SF_CXXFLAGS += -fexperimental-new-pass-manager + endif + else ifeq ($(COMP),icx) + SF_CXXFLAGS += -flto=full + SF_LDFLAGS += -flto=full + endif +endif + +### 2.4. 
Debug options +ifeq ($(debug),no) + SF_CXXFLAGS += -DNDEBUG else - -# all other architectures - -ifeq ($(ARCH),general-32) - arch = any - bits = 32 -endif - -ifeq ($(ARCH),general-64) - arch = any -endif - -ifeq ($(ARCH),armv7) - arch = armv7 - prefetch = yes - bits = 32 - arm_version = 7 -endif - -ifeq ($(ARCH),armv7-neon) - arch = armv7 - prefetch = yes - popcnt = yes - neon = yes - bits = 32 - arm_version = 7 -endif - -ifeq ($(ARCH),armv8) - arch = armv8 - prefetch = yes - popcnt = yes - neon = yes - arm_version = 8 -endif - -ifeq ($(ARCH),armv8-dotprod) - arch = armv8 - prefetch = yes - popcnt = yes - neon = yes - dotprod = yes - arm_version = 8 -endif - -ifeq ($(ARCH),apple-silicon) - arch = arm64 - prefetch = yes - popcnt = yes - neon = yes - dotprod = yes - arm_version = 8 -endif - -ifeq ($(ARCH),ppc-32) - arch = ppc - bits = 32 -endif - -ifeq ($(ARCH),ppc-64) - arch = ppc64 - popcnt = yes - prefetch = yes -endif - -ifeq ($(findstring e2k,$(ARCH)),e2k) - arch = e2k - mmx = yes - bits = 64 - sse = yes - sse2 = yes - ssse3 = yes - sse41 = yes - popcnt = yes -endif - -ifeq ($(ARCH),riscv64) - arch = riscv64 -endif - -ifeq ($(ARCH),loongarch64) - arch = loongarch64 + SF_CXXFLAGS += -g endif -endif - -### ========================================================================== -### Section 3. Low-level Configuration -### ========================================================================== +### 2.5. Sanitizer options -### 3.1 Selecting compiler (default = gcc) -ifeq ($(MAKELEVEL),0) - export ENV_CXXFLAGS := $(CXXFLAGS) - export ENV_DEPENDFLAGS := $(DEPENDFLAGS) - export ENV_LDFLAGS := $(LDFLAGS) +ifneq ($(sanitize),none) + SF_CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) endif -CXXFLAGS = $(ENV_CXXFLAGS) -Wall -Wcast-qual -fno-exceptions -std=c++17 $(EXTRACXXFLAGS) -DEPENDFLAGS = $(ENV_DEPENDFLAGS) -std=c++17 -LDFLAGS = $(ENV_LDFLAGS) $(EXTRALDFLAGS) +### 2.6. 
Include Git commit hash and date -ifeq ($(COMP),) - COMP=gcc +GIT_SHA := $(shell git rev-parse --short=8 HEAD 2> /dev/null) +ifneq ($(GIT_SHA),) + SF_CXXFLAGS += -DGIT_SHA=$(GIT_SHA) endif -ifeq ($(COMP),gcc) - comp=gcc - CXX=g++ - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations - - ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) - ifeq ($(OS),Android) - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - ifeq ($(ARCH),riscv64) - CXXFLAGS += -latomic - endif - else ifeq ($(ARCH),loongarch64) - CXXFLAGS += -latomic - else - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - - ifeq ($(arch),$(filter $(arch),armv7)) - LDFLAGS += -latomic - endif - - ifneq ($(KERNEL),Darwin) - LDFLAGS += -Wl,--no-as-needed - endif +GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2> /dev/null) +ifneq ($(GIT_DATE),) + SF_CXXFLAGS += -DGIT_DATE=$(GIT_DATE) endif -ifeq ($(target_windows),yes) - LDFLAGS += -static -endif +### 2.7. Add flags based on target OS -ifeq ($(COMP),mingw) - comp=mingw - - ifeq ($(bits),64) - ifeq ($(shell which x86_64-w64-mingw32-c++-posix 2> /dev/null),) - CXX=x86_64-w64-mingw32-c++ - else - CXX=x86_64-w64-mingw32-c++-posix - endif - else - ifeq ($(shell which i686-w64-mingw32-c++-posix 2> /dev/null),) - CXX=i686-w64-mingw32-c++ - else - CXX=i686-w64-mingw32-c++-posix - endif - endif - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-declarations +ifeq ($(MINGW),1) + SF_LDFLAGS += -static endif -ifeq ($(COMP),icx) - comp=icx - CXX=icpx - CXXFLAGS += --intel -pedantic -Wextra -Wshadow -Wmissing-prototypes \ - -Wconditional-uninitialized -Wabi -Wdeprecated -endif - -ifeq ($(COMP),clang) - comp=clang - CXX=clang++ - ifeq ($(target_windows),yes) - CXX=x86_64-w64-mingw32-clang++ - endif - - CXXFLAGS += -pedantic -Wextra -Wshadow -Wmissing-prototypes \ - -Wconditional-uninitialized - - ifeq ($(filter $(KERNEL),Darwin OpenBSD FreeBSD),) - ifeq ($(target_windows),) - ifneq ($(RTLIB),compiler-rt) - LDFLAGS += -latomic - 
endif - endif - endif - - ifeq ($(arch),$(filter $(arch),armv7 armv8 riscv64)) - ifeq ($(OS),Android) - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif - ifeq ($(ARCH),riscv64) - CXXFLAGS += -latomic - endif - else ifeq ($(ARCH),loongarch64) - CXXFLAGS += -latomic - else - CXXFLAGS += -m$(bits) - LDFLAGS += -m$(bits) - endif -endif +endif # CXX_REQUIRED_RULES -ifeq ($(KERNEL),Darwin) - CXXFLAGS += -mmacosx-version-min=10.15 - LDFLAGS += -mmacosx-version-min=10.15 - ifneq ($(arch),any) - CXXFLAGS += -arch $(arch) - LDFLAGS += -arch $(arch) - endif - XCRUN = xcrun -endif +### 3. Add flags from architecture-specific Makefile +### Note that this section is not enclosed in the CXX_REQUIRED_RULES block; +### Users shall be able to see the help text even when there is no compiler. -# To cross-compile for Android, NDK version r21 or later is recommended. -# In earlier NDK versions, you'll need to pass -fno-addrsig if using GNU binutils. -# Currently we don't know how to make PGO builds with the NDK yet. 
-ifeq ($(COMP),ndk) - CXXFLAGS += -stdlib=libc++ -fPIE - comp=clang - ifeq ($(arch),armv7) - CXX=armv7a-linux-androideabi16-clang++ - CXXFLAGS += -mthumb -march=armv7-a -mfloat-abi=softfp -mfpu=neon - ifneq ($(shell which arm-linux-androideabi-strip 2>/dev/null),) - STRIP=arm-linux-androideabi-strip - else - STRIP=llvm-strip - endif - endif - ifeq ($(arch),armv8) - CXX=aarch64-linux-android21-clang++ - ifneq ($(shell which aarch64-linux-android-strip 2>/dev/null),) - STRIP=aarch64-linux-android-strip - else - STRIP=llvm-strip - endif - endif - ifeq ($(arch),x86_64) - CXX=x86_64-linux-android21-clang++ - ifneq ($(shell which x86_64-linux-android-strip 2>/dev/null),) - STRIP=x86_64-linux-android-strip - else - STRIP=llvm-strip - endif - endif - LDFLAGS += -static-libstdc++ -pie -lm -latomic +ifeq ($(ARCH),) + override ARCH := native endif -ifeq ($(comp),icx) - profile_make = icx-profile-make - profile_use = icx-profile-use -else ifeq ($(comp),clang) - profile_make = clang-profile-make - profile_use = clang-profile-use +ifeq ($(ARCH),native) + ARCH_NATIVE := y + SF_CXXFLAGS += -march=native -DARCH_NATIVE else - profile_make = gcc-profile-make - profile_use = gcc-profile-use - ifeq ($(KERNEL),Darwin) - EXTRAPROFILEFLAGS = -fvisibility=hidden - endif -endif + ifneq ($(filter x86%,$(ARCH)),) + ARCH_FAMILY := i386 + else ifneq ($(filter arm%,$(ARCH)),) + ARCH_FAMILY := arm + else + ARCH_FAMILY := generic + endif -### Allow overwriting CXX from command line -ifdef COMPCXX - CXX=$(COMPCXX) + include ./arch/$(ARCH_FAMILY)/Makefile endif -### Sometimes gcc is really clang -ifeq ($(COMP),gcc) - gccversion := $(shell $(CXX) --version 2>/dev/null) - gccisclang := $(findstring clang,$(gccversion)) - ifneq ($(gccisclang),) - profile_make = clang-profile-make - profile_use = clang-profile-use - endif -endif +export ARCH -### On mingw use Windows threads, otherwise POSIX -ifneq ($(comp),mingw) - CXXFLAGS += -DUSE_PTHREADS - # On Android Bionic's C library comes with its own pthread 
implementation bundled in - ifneq ($(OS),Android) - # Haiku has pthreads in its libroot, so only link it in on other platforms - ifneq ($(KERNEL),Haiku) - ifneq ($(COMP),ndk) - LDFLAGS += -lpthread - endif - endif - endif -endif +SF_CXXFLAGS += -DARCH=$(ARCH) -### 3.2.1 Debugging -ifeq ($(debug),no) - CXXFLAGS += -DNDEBUG -else - CXXFLAGS += -g -endif - -### 3.2.2 Debugging with undefined behavior sanitizers -ifneq ($(sanitize),none) - CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize)) - LDFLAGS += $(addprefix -fsanitize=,$(sanitize)) -endif +### 4. Extra flags for cross-compilation +### Information of target architecture is needed here. -### 3.3 Optimization -ifeq ($(optimize),yes) +ifneq ($(filter $(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) - CXXFLAGS += -O3 -funroll-loops - - ifeq ($(comp),gcc) - ifeq ($(OS), Android) - CXXFLAGS += -fno-gcse -mthumb -march=armv7-a -mfloat-abi=softfp - endif - endif - - ifeq ($(KERNEL),Darwin) - ifeq ($(comp),$(filter $(comp),clang icx)) - CXXFLAGS += -mdynamic-no-pic - endif - - ifeq ($(comp),gcc) - ifneq ($(arch),arm64) - CXXFLAGS += -mdynamic-no-pic - endif - endif - endif - - ifeq ($(comp),clang) - clangmajorversion := $(shell $(CXX) -dumpversion 2>/dev/null | cut -f1 -d.) 
- ifeq ($(shell expr $(clangmajorversion) \< 16),1) - CXXFLAGS += -fexperimental-new-pass-manager - endif - endif +# Android NDK +ifneq ($(filter $(ARCH_FAMILY),i386 arm),) + ifeq ($(call test-compiler-macro,__ANDROID__),1) + SF_CXXFLAGS += -stdlib=libc++ -fPIE + SF_LDFLAGS += -static-libstdc++ -pie + SF_LIBS += m atomic + endif endif -### 3.4 Bits -ifeq ($(bits),64) - CXXFLAGS += -DIS_64BIT +# Link atomic library if not i386/arm family +ifneq ($(ARCH_NATIVE),y) + ifeq ($(filter $(ARCH_FAMILY),i386 arm),) + SF_LIBS += atomic + endif endif -### 3.5 prefetch and popcount -ifeq ($(prefetch),yes) - ifeq ($(sse),yes) - CXXFLAGS += -msse - endif -else - CXXFLAGS += -DNO_PREFETCH -endif +endif # CXX_REQUIRED_RULES +endif # MAKELEVEL=0 -ifeq ($(popcnt),yes) - ifeq ($(arch),$(filter $(arch),ppc64 armv7 armv8 arm64)) - CXXFLAGS += -DUSE_POPCNT - else - CXXFLAGS += -msse3 -mpopcnt -DUSE_POPCNT - endif -endif +SF_CXXFLAGS := $(strip $(SF_CXXFLAGS) $(CXXFLAGS)) +SF_LDFLAGS := $(strip $(SF_LDFLAGS) $(LDFLAGS)) +SF_LIBS := $(strip $(SF_LIBS) $(LIBS)) -### 3.6 SIMD architectures -ifeq ($(avx2),yes) - CXXFLAGS += -DUSE_AVX2 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx2 -mbmi - endif -endif +export SF_CXXFLAGS SF_LDFLAGS SF_LIBS -ifeq ($(avxvnni),yes) - CXXFLAGS += -DUSE_VNNI -DUSE_AVXVNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavxvnni - endif -endif +### ========================================================================== -ifeq ($(avx512),yes) - CXXFLAGS += -DUSE_AVX512 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw - endif -endif +define HELP_STRING +To see architecture-specific build options, run 'make help ARCH='. 
+Currently supported values: x86, arm, generic -ifeq ($(vnni256),yes) - CXXFLAGS += -DUSE_VNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 - endif -endif +How-to-build examples: -ifeq ($(vnni512),yes) - CXXFLAGS += -DUSE_VNNI - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mavx512f -mavx512bw -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=512 - endif -endif + make profile-build -ifeq ($(sse41),yes) - CXXFLAGS += -DUSE_SSE41 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -msse4.1 - endif -endif +Build Stockfish with profile-guided optimization (PGO) for the current +architecture. -ifeq ($(ssse3),yes) - CXXFLAGS += -DUSE_SSSE3 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mssse3 - endif -endif + make build ARCH=x86-64-avx2 CXX=clang++-18 -ifeq ($(sse2),yes) - CXXFLAGS += -DUSE_SSE2 - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -msse2 - endif -endif +Build Stockfish for the x86-64 architecture with AVX2/BMI2 support using clang++-18. -ifeq ($(mmx),yes) - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mmmx - endif -endif +Check the Stockfish wiki for advanced build configuration. -ifeq ($(neon),yes) - CXXFLAGS += -DUSE_NEON=$(arm_version) - ifeq ($(KERNEL),Linux) - ifneq ($(COMP),ndk) - ifneq ($(arch),armv8) - CXXFLAGS += -mfpu=neon - endif - endif - endif -endif +endef +export HELP_STRING -ifeq ($(dotprod),yes) - CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD -endif +# Print how-to-build help text if architecture is not set, otherwise +# list all available build presets for the selected architecture. 
+ifneq ($(ARCH_NATIVE),y) -### 3.7 pext -ifeq ($(pext),yes) - CXXFLAGS += -DUSE_PEXT - ifeq ($(comp),$(filter $(comp),gcc clang mingw icx)) - CXXFLAGS += -mbmi2 - endif -endif +help: help-arch -### 3.8.1 Try to include git commit sha for versioning -GIT_SHA := $(shell git rev-parse HEAD 2>/dev/null | cut -c 1-8) -ifneq ($(GIT_SHA), ) - CXXFLAGS += -DGIT_SHA=$(GIT_SHA) -endif +config-sanity: config-sanity-arch -### 3.8.2 Try to include git commit date for versioning -GIT_DATE := $(shell git show -s --date=format:'%Y%m%d' --format=%cd HEAD 2>/dev/null) -ifneq ($(GIT_DATE), ) - CXXFLAGS += -DGIT_DATE=$(GIT_DATE) -endif +else -### 3.8.3 Try to include architecture -ifneq ($(ARCH), ) - CXXFLAGS += -DARCH=$(ARCH) -endif +help: + @echo "$${HELP_STRING}" -### 3.9 Link Time Optimization -### This is a mix of compile and link time options because the lto link phase -### needs access to the optimization flags. -ifeq ($(optimize),yes) -ifeq ($(debug), no) - ifeq ($(comp),$(filter $(comp),clang icx)) - CXXFLAGS += -flto=full - ifeq ($(comp),icx) - CXXFLAGS += -fwhole-program-vtables - endif - ifeq ($(target_windows),yes) - CXXFLAGS += -fuse-ld=lld - endif - LDFLAGS += $(CXXFLAGS) - -# GCC and CLANG use different methods for parallelizing LTO and CLANG pretends to be -# GCC on some systems. - else ifeq ($(comp),gcc) - ifeq ($(gccisclang),) - CXXFLAGS += -flto -flto-partition=one - LDFLAGS += $(CXXFLAGS) -flto=jobserver - else - CXXFLAGS += -flto=full - LDFLAGS += $(CXXFLAGS) - endif - -# To use LTO and static linking on Windows, -# the tool chain requires gcc version 10.1 or later. - else ifeq ($(comp),mingw) - CXXFLAGS += -flto -flto-partition=one - LDFLAGS += $(CXXFLAGS) -save-temps - endif -endif endif -### 3.10 Android 5 can only run position independent executables. Note that this -### breaks Android 4.0 and earlier. 
-ifeq ($(OS), Android) - CXXFLAGS += -fPIE - LDFLAGS += -fPIE -pie -endif +define CONFIG_SANITY_STRING -### ========================================================================== -### Section 4. Public Targets -### ========================================================================== +Build options: + optimize: $(optimize) + debug: $(debug) + sanitize: $(sanitize) -help: - @echo "" - @echo "To compile stockfish, type: " - @echo "" - @echo "make -j target [ARCH=arch] [COMP=compiler] [COMPCXX=cxx]" - @echo "" - @echo "Supported targets:" - @echo "" - @echo "help > Display architecture details" - @echo "profile-build > standard build with profile-guided optimization" - @echo "build > skip profile-guided optimization" - @echo "net > Download the default nnue nets" - @echo "strip > Strip executable" - @echo "install > Install executable" - @echo "clean > Clean up" - @echo "" - @echo "Supported archs:" - @echo "" - @echo "native > select the best architecture for the host processor (default)" - @echo "x86-64-vnni512 > x86 64-bit with vnni 512bit support" - @echo "x86-64-vnni256 > x86 64-bit with vnni 512bit support, limit operands to 256bit wide" - @echo "x86-64-avx512 > x86 64-bit with avx512 support" - @echo "x86-64-avxvnni > x86 64-bit with vnni 256bit support" - @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" - @echo "x86-64-avx2 > x86 64-bit with avx2 support" - @echo "x86-64-sse41-popcnt > x86 64-bit with sse41 and popcnt support" - @echo "x86-64-modern > deprecated, currently x86-64-sse41-popcnt" - @echo "x86-64-ssse3 > x86 64-bit with ssse3 support" - @echo "x86-64-sse3-popcnt > x86 64-bit with sse3 compile and popcnt support" - @echo "x86-64 > x86 64-bit generic (with sse2 support)" - @echo "x86-32-sse41-popcnt > x86 32-bit with sse41 and popcnt support" - @echo "x86-32-sse2 > x86 32-bit with sse2 support" - @echo "x86-32 > x86 32-bit generic (with mmx compile support)" - @echo "ppc-64 > PPC 64-bit" - @echo "ppc-32 > PPC 32-bit" - @echo "armv7 > 
ARMv7 32-bit" - @echo "armv7-neon > ARMv7 32-bit with popcnt and neon" - @echo "armv8 > ARMv8 64-bit with popcnt and neon" - @echo "armv8-dotprod > ARMv8 64-bit with popcnt, neon and dot product support" - @echo "e2k > Elbrus 2000" - @echo "apple-silicon > Apple silicon ARM64" - @echo "general-64 > unspecified 64-bit" - @echo "general-32 > unspecified 32-bit" - @echo "riscv64 > RISC-V 64-bit" - @echo "loongarch64 > LoongArch 64-bit" - @echo "" - @echo "Supported compilers:" - @echo "" - @echo "gcc > GNU compiler (default)" - @echo "mingw > GNU compiler with MinGW under Windows" - @echo "clang > LLVM Clang compiler" - @echo "icx > Intel oneAPI DPC++/C++ Compiler" - @echo "ndk > Google NDK to cross-compile for Android" - @echo "" - @echo "Simple examples. If you don't know what to do, you likely want to run one of: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avx2 # typically a fast compile for common systems " - @echo "make -j profile-build ARCH=x86-64-sse41-popcnt # A more portable compile for 64-bit systems " - @echo "make -j profile-build ARCH=x86-64 # A portable compile for 64-bit systems " - @echo "" - @echo "Advanced examples, for experienced users: " - @echo "" - @echo "make -j profile-build ARCH=x86-64-avxvnni" - @echo "make -j profile-build ARCH=x86-64-avxvnni COMP=gcc COMPCXX=g++-12.0" - @echo "make -j build ARCH=x86-64-ssse3 COMP=clang" - @echo "" -ifneq ($(SUPPORTED_ARCH), true) - @echo "Specify a supported architecture with the ARCH option for more details" - @echo "" -endif +Compiler options: + CXX: $(CXX) + CXXFLAGS: $(SF_CXXFLAGS) + LDFLAGS: $(SF_LDFLAGS) $(SF_LIBS:%=-l%) +endef +export CONFIG_SANITY_STRING -.PHONY: help analyze build profile-build strip install clean net \ - objclean profileclean config-sanity \ - icx-profile-use icx-profile-make \ - gcc-profile-use gcc-profile-make \ - clang-profile-use clang-profile-make FORCE \ - format analyze +config-sanity: net + @[ "$(optimize)" = "yes" -o "$(optimize)" = "no" ] + @[ "$(debug)" = 
"yes" -o "$(debug)" = "no" ] + @[ ! -z "$(sanitize)" ] + @echo "$${CONFIG_SANITY_STRING}" -analyze: net config-sanity objclean - $(MAKE) -k ARCH=$(ARCH) COMP=$(COMP) $(OBJS) +analyze: config-sanity objclean + @$(MAKE) -k --no-print-directory CXXFLAGS="" LDFLAGS="" $(OBJS) -build: net config-sanity - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) all +build: config-sanity + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" all -profile-build: net config-sanity objclean profileclean - @echo "" +profile-build: config-sanity objclean profileclean @echo "Step 1/4. Building instrumented executable ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make) - @echo "" - @echo "Step 2/4. Running benchmark for pgo-build ..." - $(PGOBENCH) > PGOBENCH.out 2>&1 - tail -n 4 PGOBENCH.out - @echo "" - @echo "Step 3/4. Building optimized executable ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) objclean - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_use) - @echo "" - @echo "Step 4/4. Deleting profile data ..." - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) profileclean - -strip: - $(STRIP) $(EXE) - -install: - -mkdir -p -m 755 $(BINDIR) - -cp $(EXE) $(BINDIR) - $(STRIP) $(BINDIR)/$(EXE) - -# clean all -clean: objclean profileclean - @rm -f .depend *~ core - -# clean binaries and objects -objclean: - @rm -f stockfish stockfish.exe *.o ./syzygy/*.o ./nnue/*.o ./nnue/features/*.o - -# clean auxiliary profiling files -profileclean: - @rm -rf profdir - @rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s PGOBENCH.out - @rm -f stockfish.profdata *.profraw - @rm -f stockfish.*args* - @rm -f stockfish.*lt* - @rm -f stockfish.res - @rm -f ./-lstdc++.res - -# evaluation network (nnue) -net: - @$(SHELL) ../scripts/net.sh - -format: - $(CLANG-FORMAT) -i $(SRCS) $(HEADERS) -style=file - -# default target -default: - help - -### ========================================================================== -### Section 5. 
Private Targets -### ========================================================================== + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_make) + @printf "\n%s\n" "Step 2/4. Running benchmark for pgo-build ..." + @$(EMULATE) ./$(EXE) bench > PGOBENCH.out 2>&1 + @tail -n 4 PGOBENCH.out + @printf "\n%s\n" "Step 3/4. Building optimized executable ..." + @$(MAKE) --no-print-directory objclean + @$(MAKE) --no-print-directory CXXFLAGS="" LDFLAGS="" $(profile_use) + @printf "\n%s\n" "Step 4/4. Deleting profile data ..." + @$(MAKE) --no-print-directory profileclean all: $(EXE) .depend -config-sanity: net - @echo "" - @echo "Config:" - @echo "debug: '$(debug)'" - @echo "sanitize: '$(sanitize)'" - @echo "optimize: '$(optimize)'" - @echo "arch: '$(arch)'" - @echo "bits: '$(bits)'" - @echo "kernel: '$(KERNEL)'" - @echo "os: '$(OS)'" - @echo "prefetch: '$(prefetch)'" - @echo "popcnt: '$(popcnt)'" - @echo "pext: '$(pext)'" - @echo "sse: '$(sse)'" - @echo "mmx: '$(mmx)'" - @echo "sse2: '$(sse2)'" - @echo "ssse3: '$(ssse3)'" - @echo "sse41: '$(sse41)'" - @echo "avx2: '$(avx2)'" - @echo "avxvnni: '$(avxvnni)'" - @echo "avx512: '$(avx512)'" - @echo "vnni256: '$(vnni256)'" - @echo "vnni512: '$(vnni512)'" - @echo "neon: '$(neon)'" - @echo "dotprod: '$(dotprod)'" - @echo "arm_version: '$(arm_version)'" - @echo "target_windows: '$(target_windows)'" - @echo "" - @echo "Flags:" - @echo "CXX: $(CXX)" - @echo "CXXFLAGS: $(CXXFLAGS)" - @echo "LDFLAGS: $(LDFLAGS)" - @echo "" - @echo "Testing config sanity. If this fails, try 'make help' ..." 
- @echo "" - @test "$(debug)" = "yes" || test "$(debug)" = "no" - @test "$(optimize)" = "yes" || test "$(optimize)" = "no" - @test "$(SUPPORTED_ARCH)" = "true" - @test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \ - test "$(arch)" = "ppc64" || test "$(arch)" = "ppc" || test "$(arch)" = "e2k" || \ - test "$(arch)" = "armv7" || test "$(arch)" = "armv8" || test "$(arch)" = "arm64" || test "$(arch)" = "riscv64" || test "$(arch)" = "loongarch64" - @test "$(bits)" = "32" || test "$(bits)" = "64" - @test "$(prefetch)" = "yes" || test "$(prefetch)" = "no" - @test "$(popcnt)" = "yes" || test "$(popcnt)" = "no" - @test "$(pext)" = "yes" || test "$(pext)" = "no" - @test "$(sse)" = "yes" || test "$(sse)" = "no" - @test "$(mmx)" = "yes" || test "$(mmx)" = "no" - @test "$(sse2)" = "yes" || test "$(sse2)" = "no" - @test "$(ssse3)" = "yes" || test "$(ssse3)" = "no" - @test "$(sse41)" = "yes" || test "$(sse41)" = "no" - @test "$(avx2)" = "yes" || test "$(avx2)" = "no" - @test "$(avx512)" = "yes" || test "$(avx512)" = "no" - @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" - @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" - @test "$(neon)" = "yes" || test "$(neon)" = "no" - @test "$(comp)" = "gcc" || test "$(comp)" = "icx" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ - || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" - $(EXE): $(OBJS) - +$(CXX) -o $@ $(OBJS) $(LDFLAGS) + +$(CXX) $(SF_LDFLAGS) -o $@ $(OBJS) $(SF_LIBS:%=-l%) + +%.o: %.cpp + +$(CXX) $(SF_CXXFLAGS) -c -o $@ $< # Force recompilation to ensure version info is up-to-date misc.o: FORCE FORCE: -clang-profile-make: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-generate ' \ - EXTRALDFLAGS=' -fprofile-generate' \ - all - -clang-profile-use: - $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-use=stockfish.profdata' 
\ - EXTRALDFLAGS='-fprofile-use ' \ - all +.depend: $(SRCS) + -@$(CXX) $(SF_CXXFLAGS) -MM $(SRCS) > $@ && \ + printf "%s\n\n" "Dependency updated, restarting Make..." gcc-profile-make: @mkdir -p profdir - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-generate=profdir' \ - EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ - EXTRALDFLAGS='-lgcov' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-generate=profdir" LDFLAGS="" LIBS="gcov" all gcc-profile-use: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \ - EXTRACXXFLAGS+=$(EXTRAPROFILEFLAGS) \ - EXTRALDFLAGS='-lgcov' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-use=profdir -fno-peel-loops -fno-tracer" LDFLAGS="" LIBS="gcov" all + +clang-profile-make: + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-generate" LDFLAGS="-fprofile-generate" all + +clang-profile-use: + $(XCRUN) $(LLVM_PROFDATA) merge -output=stockfish.profdata *.profraw + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-use=stockfish.profdata" \ + LDFLAGS="-fprofile-use=stockfish.profdata" \ + all icx-profile-make: - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-generate ' \ - EXTRALDFLAGS=' -fprofile-instr-generate' \ - all + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-instr-generate" LDFLAGS="-fprofile-instr-generate" all icx-profile-use: - $(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw - $(MAKE) ARCH=$(ARCH) COMP=$(COMP) \ - EXTRACXXFLAGS='-fprofile-instr-use=stockfish.profdata' \ - EXTRALDFLAGS='-fprofile-use ' \ - all + @$(XCRUN) llvm-profdata merge -output=stockfish.profdata *.profraw + @$(MAKE) --no-print-directory \ + CXXFLAGS="-fprofile-instr-use=stockfish.profdata" LDFLAGS="-fprofile-use" all -.depend: $(SRCS) - -@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null - -ifeq (, $(filter $(MAKECMDGOALS), help strip install clean net objclean profileclean config-sanity)) +ifneq ($(filter 
$(MAKECMDGOALS),$(CXX_REQUIRED_RULES)),) -include .depend -endif +endif diff --git a/src/arch/.clang-format b/src/arch/.clang-format new file mode 100644 index 00000000000..50b996b1a8a --- /dev/null +++ b/src/arch/.clang-format @@ -0,0 +1,5 @@ +BasedOnStyle: InheritParentConfig + +# Architecture specific files use a lot of preprocessor directives. +# Do not indent them for better readability. +IndentPPDirectives: None diff --git a/src/arch/arm/Makefile b/src/arch/arm/Makefile new file mode 100644 index 00000000000..6f33d3e1576 --- /dev/null +++ b/src/arch/arm/Makefile @@ -0,0 +1,43 @@ +define HELP_STRING_ARM +To build Stockfish, run the following command: + +make [ARCH=] [CPU=] + +Build presets for ARM/AArch64 architecture: + +armv8-dotprod > ARMv8.2-A 64-bit, Neon with DotProd feature +armv8 > ARMv8-A 64-bit, Neon (Advanced SIMD) +armv7-neon > ARMv7-A 32-bit, Neon (Advanced SIMD) +armv7 > ARMv7-A 32-bit, no SIMD support + +Stockfish does not support AArch32 targets. +Stockfish does not support non-A profile architectures. 
+ +endef +export HELP_STRING_ARM + +ifneq ($(filter $(COMP),gcc clang icx),) + +ifeq ($(ARCH),armv8-dotprod) + SF_CXXFLAGS += -march=armv8.2-a+dotprod +else ifeq ($(ARCH),armv8) + SF_CXXFLAGS += -march=armv8-a +else ifeq ($(ARCH),armv7-neon) + SF_CXXFLAGS += -march=armv7-a -mfpu=neon -mfloat-abi=softfp +else ifeq ($(ARCH),armv7) + SF_CXXFLAGS += -march=armv7-a +endif + +endif # gcc clang icx + +.PHONY: help-arch config-sanity-arch + +help-arch: + @echo "$${HELP_STRING_ARM}" + +config-sanity-arch: + @[ "$(ARCH)" = "armv8-dotprod" -o \ + "$(ARCH)" = "armv8" -o \ + "$(ARCH)" = "armv7-neon" -o \ + "$(ARCH)" = "armv7" \ + ] diff --git a/src/arch/arm/arch.h b/src/arch/arm/arch.h new file mode 100644 index 00000000000..30f50451aaf --- /dev/null +++ b/src/arch/arm/arch.h @@ -0,0 +1,119 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_ARCH_H_INCLUDED +#define ARM_ARCH_H_INCLUDED + +#if !defined(__arm__) && !defined(__aarch64__) && __ARM_ARCH_PROFILE != 'A' +#error "Not supported in the current architecture." +#endif + +#if __ARM_ARCH >= 8 && (!defined(__ARM_64BIT_STATE) || !defined(__ARM_NEON)) +#error "Invalid AArch64 state." 
+#endif + +#include +#include +#include + +#include "common.h" + +#include + +#ifdef __ARM_NEON + +#include + +namespace Stockfish { + +template +inline int __neon_cnt(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + + uint8x8_t cnt = vcnt_u8(vcreate_u8(std::uint64_t(n))); + +#if __ARM_ARCH >= 8 + return vaddv_u8(cnt); +#else + return vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(cnt))), 0); +#endif +} + +inline void vdotq_s32_v(int32x4_t& acc, int8x16_t in, int8x16_t col) { +#ifdef __ARM_FEATURE_DOTPROD + acc = vdotq_s32(acc, in, col); +#elif __ARM_ARCH >= 8 + int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col)); + int16x8_t product1 = vmull_high_s8(in, col); + int16x8_t sum = vpaddq_s16(product0, product1); + acc = vpadalq_s16(acc, sum); +#else + int16x8_t product0 = vmull_s8(vget_low_s8(in), vget_low_s8(col)); + int16x8_t product1 = vmull_s8(vget_high_s8(in), vget_high_s8(col)); + int16x8_t sum = + vcombine_s16(vqmovn_s32(vpaddlq_s16(product0)), vqmovn_s32(vpaddlq_s16(product1))); + acc = vpadalq_s16(acc, sum); +#endif +} + +} // namespace Stockfish + +#endif // __ARM_NEON + +namespace Stockfish { + +inline constexpr bool ArchImpl::Is64Bit = __ARM_ARCH >= 8; +inline constexpr bool ArchImpl::UsePEXT = false; + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + +#ifdef __ARM_NEON + return __neon_cnt(n); +#else + return __popcount_value(n); +#endif +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { + return 0; +} + +// =========================================================================== +// The functions below are used on ARM/AArch64 targets only. 
+// =========================================================================== + +template +inline int ctz(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + assert(n != 0); + + if constexpr (sizeof(T) == 8) + return __clzll(__rbitll(std::uint64_t(n))); + else + return __clz(__rbit(std::uint32_t(n))); +} + +} // namespace Stockfish + +#endif // ARM_ARCH_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/affine_transform.h b/src/arch/arm/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..3fb69bce2d8 --- /dev/null +++ b/src/arch/arm/nnue/layers/affine_transform.h @@ -0,0 +1,106 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. +// If none is defined, fall back to the generic implementation. 
+#ifndef __ARM_NEON + +#include "arch/generic/nnue/layers/affine_transform.h" + +#else + +#include "../../arch.h" + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + i % 4; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + if constexpr (OutputDimensions > 1) + { + static constexpr IndexType OutputLanes = 16 / sizeof(OutputType); + static_assert(OutputDimensions % OutputLanes == 0); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + int32x4_t acc[NumRegs]; + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType i = 0; i < NumChunks; ++i) + { + const int8x16_t in = + vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i])); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + vdotq_s32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; + } + else if constexpr (OutputDimensions == 1) + { + static constexpr IndexType InputLanes = 16 / sizeof(InputType); + static_assert(PaddedInputDimensions % InputLanes == 0); + + static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes; + + int32x4_t sum = vdupq_n_s32(0); + + for (IndexType j = 0; j < NumChunks; ++j) + { + const int8x16_t in = reinterpret_cast(input)[j]; + const int8x16_t row = reinterpret_cast(weights)[j]; + vdotq_s32_v(sum, in, row); + } + +#if __ARM_ARCH >= 8 + output[0] = vaddvq_s32(sum) + biases[0]; +#else + output[0] = vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + + 
vgetq_lane_s32(sum, 3) + biases[0]; +#endif + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/affine_transform_sparse_input.h b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h new file mode 100644 index 00000000000..546002c256f --- /dev/null +++ b/src/arch/arm/nnue/layers/affine_transform_sparse_input.h @@ -0,0 +1,129 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED +#define ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include "../../arch.h" + +#include +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +#if __ARM_ARCH >= 8 + +alignas(CacheLineSize) static const std::array, 256> lookupIndices = + [] { + std::array, 256> array{}; + for (std::uint64_t i = 0; i < 256; ++i) + { + std::uint64_t j = i, k = 0; + while (j) + array[i][k++] = ctz(j), j &= j - 1; + } + return array; + }(); + +template +class AffineTransformSparseInput: public AffineTransform { + __DEFINE_BASE_PROPERTIES + + static_assert(OutputDimensions % 16 == 0, + "OutputDimensions must be multiple of 16 for this layer."); + + public: + void propagate(const InputType* input, OutputType* output) const; + + private: + template + static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) { + IndexType count = 0; + uint16x8_t base = vdupq_n_u16(0); + + const auto in = reinterpret_cast(input); + + for (IndexType i = 0; i < InputDimensions / 8; ++i) + { + const int32x4_t chunk0 = in[i * 2]; + const int32x4_t chunk1 = in[i * 2 + 1]; + + static const uint32x4_t movemask = [] { + const std::uint32_t n[4] = {1, 2, 4, 8}; + return vld1q_u32(n); + }(); + + const std::uint32_t nnz = vaddvq_u32(vandq_u32(vtstq_s32(chunk0, chunk0), movemask)) + | vaddvq_u32(vandq_u32(vtstq_s32(chunk1, chunk1), movemask)) + << 4; + const uint16x8_t offsets = *reinterpret_cast(&lookupIndices[nnz]); + *reinterpret_cast(indices + count) = vaddq_u16(base, offsets); + count += popcount(nnz); + base = vaddq_u16(base, vdupq_n_u16(8)); + } + + return count; + } +}; + +template +void AffineTransformSparseInput::propagate(const InputType* input, + OutputType* output) const { + static constexpr IndexType OutputLanes = 16 / sizeof(OutputType); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + int32x4_t acc[NumRegs]; + std::uint16_t nnz[NumChunks]; + IndexType count = 
populate_nz_indices(input, nnz); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType j = 0; j < count; ++j) + { + const auto i = nnz[j]; + const int8x16_t in = + vreinterpretq_s8_s32(vdupq_n_s32(reinterpret_cast(input)[i])); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + for (std::size_t k = 0; k < array_size(acc); ++k) + vdotq_s32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; +} + +#else + +template +using AffineTransformSparseInput = AffineTransform; + +#endif // __ARM_ARCH >= 8 + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // ARM_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/clipped_relu.h b/src/arch/arm/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..7e644ec5060 --- /dev/null +++ b/src/arch/arm/nnue/layers/clipped_relu.h @@ -0,0 +1,72 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. 
+// If none is defined, fall back to the generic implementation. +#ifndef __ARM_NEON + +#include "arch/generic/nnue/layers/clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 8; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { +#if __ARM_ARCH >= 8 + int16x4_t words0 = vqshrn_n_s32(in[i * 2], WeightScaleBits); + int16x8_t words = vqshrn_high_n_s32(words0, in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(words), vdup_n_s8(0)); +#else + union { + int16x4x2_t tuple; + int16x8_t all; + } words; + + words.tuple.val[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); + words.tuple.val[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); + out[i] = vmax_s8(vqmovn_s16(words.all), vdup_n_s8(0)); +#endif + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/arm/nnue/layers/sqr_clipped_relu.h b/src/arch/arm/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..f67388c6fac --- /dev/null +++ b/src/arch/arm/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,29 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// lazy +#include "arch/generic/nnue/layers/sqr_clipped_relu.h" + +#endif // ARM_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/arm/nnue/nnue_feature_transformer.h b/src/arch/arm/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..fe0b077f008 --- /dev/null +++ b/src/arch/arm/nnue/nnue_feature_transformer.h @@ -0,0 +1,343 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check ARM/AArch64 SIMD features. 
+// If none is defined, fall back to the generic implementation. +#ifndef __ARM_NEON + +#include "arch/generic/nnue/nnue_feature_transformer.h" + +#else + +#include "../arch.h" + +#include +#include + +#include "misc.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +struct FeatureTransformer::Details { + private: + static constexpr int NumQReg = 16; + + public: + static constexpr int OptimalAccRegisterCount = + optimal_register_count<16, NumQReg, sizeof(WeightType), TransformedFeatureDimensions>(); + static constexpr int OptimalPSQTRegisterCount = + optimal_register_count<16, NumQReg, sizeof(PSQTWeightType), PSQTBuckets>(); + + static constexpr IndexType TileHeight = OptimalAccRegisterCount * 16 / sizeof(WeightType); + static constexpr IndexType PsqtTileHeight = + OptimalPSQTRegisterCount * 16 / sizeof(PSQTWeightType); + + static_assert(HalfDimensions % TileHeight == 0, + "HalfDimensions must be multiple of TileHeight"); + static_assert(PSQTBuckets % PsqtTileHeight == 0, + "PSQTBuckets must be multiple of PsqtTileHeight"); +}; + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() {} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + // The most common case when updating the accumulator incrementally. + // Calculates feature differences directly without using tiling mechanism. 
+ if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) + { + const auto accIn = + reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); + const auto accOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); + + const IndexType offsetR0 = HalfDimensions * removed[0]; + const auto columnR0 = reinterpret_cast(&weights[offsetR0]); + const IndexType offsetA = HalfDimensions * added[0]; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i) + accOut[i] = vaddq_s16(vsubq_s16(accIn[i], columnR0[i]), columnA[i]); + } + else + { + const IndexType offsetR1 = HalfDimensions * removed[1]; + const auto columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 16; ++i) + accOut[i] = + vsubq_s16(vaddq_s16(accIn[i], columnA[i]), vaddq_s16(columnR0[i], columnR1[i])); + } + + const auto accPsqtIn = + reinterpret_cast(&(computed->*accPtr).psqtAccumulation[Perspective][0]); + const auto accPsqtOut = + reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); + + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; + auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); + const IndexType offsetPsqtA = PSQTBuckets * added[0]; + auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i) + accPsqtOut[i] = vaddq_s32(vsubq_s32(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA[i]); + } + else + { + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; + const auto columnPsqtR1 = + reinterpret_cast(&psqtWeights[offsetPsqtR1]); + + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / 16; ++i) + accPsqtOut[i] = vsubq_s32(vaddq_s32(accPsqtIn[i], columnPsqtA[i]), + vaddq_s32(columnPsqtR0[i], columnPsqtR1[i])); + } + } + else + { + 
int16x8_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType i = 0; i < HalfDimensions / Details::TileHeight; ++i) + { + const IndexType offsetRow = i * Details::TileHeight; + + const auto accTileIn = reinterpret_cast( + &(computed->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = accTileIn[j]; + + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = vsubq_s16(acc[j], column[j]); + } + + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = vaddq_s16(acc[j], column[j]); + } + + const auto accTileOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + accTileOut[j] = acc[j]; + } + + int32x4_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType i = 0; i < PSQTBuckets / Details::PsqtTileHeight; ++i) + { + const IndexType offsetRow = i * Details::PsqtTileHeight; + + auto accTilePsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = accTilePsqtIn[j]; + + for (const auto index : removed) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = vsubq_s32(psqt[j], columnPsqt[j]); + } + + for (const auto index : added) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = vaddq_s32(psqt[j], columnPsqt[j]); + } + + auto 
accTilePsqtOut = reinterpret_cast( + &(next->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + accTilePsqtOut[j] = psqt[j]; + } + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + int16x8_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType j = 0; j < HalfDimensions / Details::TileHeight; ++j) + { + const IndexType offsetRow = j * Details::TileHeight; + + const auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][offsetRow]); + const auto entryTile = reinterpret_cast(&entry.accumulation[offsetRow]); + + for (IndexType k = 0; k < array_size(acc); ++k) + acc[k] = entryTile[k]; + + std::size_t i = 0; + for (; i < std::min(removed.size(), added.size()); ++i) + { + const IndexType offsetR = HalfDimensions * removed[i] + offsetRow; + const auto columnR = reinterpret_cast(&weights[offsetR]); + const IndexType offsetA = HalfDimensions * added[i] + offsetRow; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vaddq_s16(acc[k], vsubq_s16(columnA[k], columnR[k])); + } + for (; i < removed.size(); ++i) + { + const IndexType offset = HalfDimensions * removed[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vsubq_s16(acc[k], column[k]); + } + for (; i < added.size(); ++i) + { + const IndexType offset = HalfDimensions * added[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = vaddq_s16(acc[k], column[k]); + } + + for (std::size_t k = 0; k < array_size(acc); k++) + entryTile[k] = acc[k]; + for (std::size_t k = 0; k < array_size(acc); 
k++) + accTile[k] = acc[k]; + } + + int32x4_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType j = 0; j < PSQTBuckets / Details::PsqtTileHeight; ++j) + { + const IndexType offsetRow = j * Details::PsqtTileHeight; + + const auto accTilePsqt = + reinterpret_cast(&accumulator.psqtAccumulation[Perspective][offsetRow]); + const auto entryTilePsqt = reinterpret_cast(&entry.psqtAccumulation[offsetRow]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = entryTilePsqt[k]; + + for (std::size_t i = 0; i < removed.size(); ++i) + { + const IndexType offset = PSQTBuckets * removed[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = vsubq_s32(psqt[k], columnPsqt[k]); + } + for (std::size_t i = 0; i < added.size(); ++i) + { + const IndexType offset = PSQTBuckets * added[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = vaddq_s32(psqt[k], columnPsqt[k]); + } + + for (std::size_t k = 0; k < array_size(psqt); ++k) + entryTilePsqt[k] = psqt[k]; + for (std::size_t k = 0; k < array_size(psqt); ++k) + accTilePsqt[k] = psqt[k]; + } +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + static constexpr IndexType OutputChunkSize = 16 / sizeof(OutputType); + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + + static constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; + + const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const auto in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const auto in1 = + reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + const auto out = 
reinterpret_cast(&output[(HalfDimensions / 2) * p]); + + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + static const int16x8_t Zeroes = vdupq_n_s16(0); + static const int16x8_t Ones = vdupq_n_s16(127 * 2); + + const int16x8_t sum0a = + vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 0], Ones), Zeroes), 6); + const int16x8_t sum0b = + vshlq_n_s16(vmaxq_s16(vminq_s16(in0[j * 2 + 1], Ones), Zeroes), 6); + const int16x8_t sum1a = vminq_s16(in1[j * 2 + 0], Ones); + const int16x8_t sum1b = vminq_s16(in1[j * 2 + 1], Ones); + + const int16x8_t pa = vqdmulhq_s16(sum0a, sum1a); + const int16x8_t pb = vqdmulhq_s16(sum0b, sum1b); + + out[j] = vcombine_u8(vqmovun_s16(pa), vqmovun_s16(pb)); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // !__ARM_NEON + +#endif // ARM_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/arch/generic/Makefile b/src/arch/generic/Makefile new file mode 100644 index 00000000000..230916e41b7 --- /dev/null +++ b/src/arch/generic/Makefile @@ -0,0 +1,32 @@ +define GENERIC_HELP_STRING +To build Stockfish, run the following command: + +make [ARCH=] [COMP=] + +Build presets for other architectures: + +general-64 > 64-bit generic +general-32 > 32-bit generic + +To see architecture-specific build options, run 'make help ARCH='. 
+Currently supported values: x86, arm, generic + +endef +export GENERIC_HELP_STRING + +ifeq ($(COMP),$(filter $(COMP),gcc clang icx)) + +ifeq ($(ARCH),general-64) + CXXFLAGS += -m64 +else ifeq ($(ARCH),general-32) + CXXFLAGS += -m32 +endif + +endif + +help-arch: + @echo "$${GENERIC_HELP_STRING}" + +config-sanity-arch: + @test "$(ARCH)" = "general-64" || \ + test "$(ARCH)" = "general-32" diff --git a/src/arch/generic/arch.h b/src/arch/generic/arch.h new file mode 100644 index 00000000000..3641ab6f3f2 --- /dev/null +++ b/src/arch/generic/arch.h @@ -0,0 +1,51 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_ARCH_H_INCLUDED +#define GENERIC_ARCH_H_INCLUDED + +#include +#include + +#include "common.h" + +namespace Stockfish { + +// There is no practical way to detect the register width, so we assume that +// it is always 64-bit if address size is 64-bit. 
+inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8; + +inline constexpr bool ArchImpl::UsePEXT = false; + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) {} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + return __popcount_value(n); +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { + return 0; +} + +} // namespace Stockfish + +#endif // GENERIC_ARCH_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/affine_transform.h b/src/arch/generic/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..341816d3801 --- /dev/null +++ b/src/arch/generic/nnue/layers/affine_transform.h @@ -0,0 +1,59 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return i; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + std::memcpy(output, biases, sizeof(OutputType) * OutputDimensions); + + // Traverse weights in transpose order to take advantage of input sparsity + for (IndexType i = 0; i < InputDimensions; ++i) + { + const InputType in = input[i]; + if (in) + { + const WeightType* w = &weights[i]; + for (IndexType j = 0; j < OutputDimensions; ++j) + output[j] += w[j * PaddedInputDimensions] * in; + } + } +} + +template +using AffineTransformSparseInput = AffineTransform; + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/clipped_relu.h b/src/arch/generic/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..ba965b9b8e7 --- /dev/null +++ b/src/arch/generic/nnue/layers/clipped_relu.h @@ -0,0 +1,40 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#ifndef GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + for (IndexType i = 0; i < InputDimensions; ++i) + output[i] = static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127)); +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/generic/nnue/layers/sqr_clipped_relu.h b/src/arch/generic/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..5048df5406a --- /dev/null +++ b/src/arch/generic/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,45 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const { + // The correct formula is to divide by 127, but we need to make it fast + // therefore right-shift by extra 7 bits is used instead. Needs to be + // accounted for in the trainer. + for (IndexType i = 0; i < InputDimensions; ++i) + output[i] = static_cast(std::min( + std::int64_t(127), std::int64_t(input[i]) * input[i] >> (2 * WeightScaleBits + 7))); +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // GENERIC_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/generic/nnue/nnue_feature_transformer.h b/src/arch/generic/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..f885980541a --- /dev/null +++ b/src/arch/generic/nnue/nnue_feature_transformer.h @@ -0,0 +1,146 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include +#include + +#include "position.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() {} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + + std::memcpy((next->*accPtr).accumulation[Perspective], + (computed->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType)); + std::memcpy((next->*accPtr).psqtAccumulation[Perspective], + (computed->*accPtr).psqtAccumulation[Perspective], + PSQTBuckets * sizeof(PSQTWeightType)); + + // Difference calculation for the deactivated features + for (const auto index : removed) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] -= weights[wOffset + i]; + + for (IndexType i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] -= psqtWeights[pwOffset + i]; + } + + // Difference calculation for the activated features + for (const auto index : added) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType i = 0; i < HalfDimensions; ++i) + (next->*accPtr).accumulation[Perspective][i] += weights[wOffset + i]; + + for (IndexType i = 0; i < PSQTBuckets; ++i) + (next->*accPtr).psqtAccumulation[Perspective][i] += psqtWeights[pwOffset + i]; + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + for (const auto index : removed) + { + const 
IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[j] -= weights[wOffset + j]; + + for (IndexType k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] -= psqtWeights[pwOffset + k]; + } + for (const auto index : added) + { + const IndexType wOffset = HalfDimensions * index; + const IndexType pwOffset = PSQTBuckets * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[j] += weights[wOffset + j]; + + for (IndexType k = 0; k < PSQTBuckets; ++k) + entry.psqtAccumulation[k] += psqtWeights[pwOffset + k]; + } + + // The accumulator of the refresh entry has been updated. + // Now copy its content to the actual accumulator we were refreshing. + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, + sizeof(BiasType) * HalfDimensions); + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, + sizeof(PSQTWeightType) * PSQTBuckets); +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + const int perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = (HalfDimensions / 2) * p; + + for (IndexType j = 0; j < HalfDimensions / 2; ++j) + { + BiasType sum0 = accumulation[perspectives[p]][j]; + BiasType sum1 = accumulation[perspectives[p]][j + HalfDimensions / 2]; + sum0 = std::clamp(sum0, 0, 127 * 2); + sum1 = std::clamp(sum1, 0, 127 * 2); + output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // GENERIC_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/arch/i386/Makefile b/src/arch/i386/Makefile new file mode 100644 index 00000000000..f9a97330a2a --- /dev/null +++ b/src/arch/i386/Makefile @@ -0,0 +1,116 @@ 
+define HELP_STRING_I386 +To build Stockfish, run the following command: + +make [ARCH=] [NO_AVX512=1] + +Build presets for x86 architecture: + +x86-64-avx512-vnni > 64-bit, AVX-512 with BW/VL/VNNI +x86-64-avx512 > 64-bit, AVX-512 with BW/VL +x86-64-avxvnni > 64-bit, AVX2 with AVX-VNNI +x86-64-bmi2 > 64-bit, AVX2 and BMI2 +x86-64-avx2 > 64-bit, AVX2 +x86-64-avx > 64-bit, AVX +x86-64-popcnt > 64-bit, POPCNT with SSE4.2 +x86-64-sse41 > 64-bit, SSE4.1 +x86-64-ssse3 > 64-bit, SSSE3 +x86-64 > 64-bit generic (supports up to SSE2) +x86-32 > 32-bit generic (supports up to SSE2) + +Build presets for x86 architecture, AMD processors: + +x86-64-bmi > 64-bit, AVX with BMI +x86-64-abm > 64-bit, SSE4a with ABM + +x86 options: + + NO_AVX512 + +If NO_AVX512 is set, Stockfish will use 256-bit SIMD instructions instead of +512-bit ones. This option is only available if AVX-512 is enabled. + +endef +export HELP_STRING_I386 + +# Compatibility layer for old Makefile presets +ifeq ($(ARCH),x86-64-vnni512) + override ARCH := x86-64-avx512-vnni +endif + +ifeq ($(ARCH),x86-64-vnni256) + override ARCH := x86-64-avx512-vnni + override NO_AVX512 := 1 +endif + +ifneq ($(filter $(COMP),gcc clang icx),) + +ifeq ($(ARCH),x86-64-avx512-vnni) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \ + -mavx512vl -mavx512vnni + ifeq ($(NO_AVX512),1) + SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512 + endif +else ifeq ($(ARCH),x86-64-avx512) + # Xeon Phi is no longer supported on future compiler versions. 
+ SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavx512f -mavx512bw \ + -mavx512vl + ifeq ($(NO_AVX512),1) + SF_CXXFLAGS += -mprefer-vector-width=256 -DNO_AVX512 + endif +else ifeq ($(ARCH),x86-64-avxvnni) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 -mavxvnni +else ifeq ($(ARCH),x86-64-bmi2) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 -mbmi2 +else ifeq ($(ARCH),x86-64-avx2) + # VIA Eden X4 does not support BMI/BMI2, need a custom profile if required. + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi -mavx2 +else ifeq ($(ARCH),x86-64-bmi) + # Some AMD processors (AMD family 15h) have BMI1 only. + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx -mbmi +else ifeq ($(ARCH),x86-64-avx) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt -mavx +else ifeq ($(ARCH),x86-64-popcnt) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 \ + -mpopcnt +else ifeq ($(ARCH),x86-64-sse41) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 +else ifeq ($(ARCH),x86-64-ssse3) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -mssse3 +else ifeq ($(ARCH),x86-64-abm) + SF_CXXFLAGS += -m64 -mmmx -msse -msse2 -msse3 -msse4a -mabm +else ifeq ($(ARCH),x86-64) + # Modern compilers add SSE2 flag by default. 
+ SF_CXXFLAGS += -m64 +else ifeq ($(ARCH),x86-32) + SF_CXXFLAGS += -m32 +endif + +endif # gcc clang icx + +.PHONY: help-arch config-sanity-arch + +help-arch: + @echo "$${HELP_STRING_I386}" + +config-sanity-arch: + @[ "$(ARCH)" = "x86-64-avx512-vnni" -o \ + "$(ARCH)" = "x86-64-avx512" -o \ + "$(ARCH)" = "x86-64-avxvnni" -o \ + "$(ARCH)" = "x86-64-bmi2" -o \ + "$(ARCH)" = "x86-64-avx2" -o \ + "$(ARCH)" = "x86-64-bmi" -o \ + "$(ARCH)" = "x86-64-avx" -o \ + "$(ARCH)" = "x86-64-popcnt" -o \ + "$(ARCH)" = "x86-64-sse41" -o \ + "$(ARCH)" = "x86-64-ssse3" -o \ + "$(ARCH)" = "x86-64-abm" -o \ + "$(ARCH)" = "x86-64" -o \ + "$(ARCH)" = "x86-32" \ + ] diff --git a/src/arch/i386/arch.h b/src/arch/i386/arch.h new file mode 100644 index 00000000000..1ef79088240 --- /dev/null +++ b/src/arch/i386/arch.h @@ -0,0 +1,595 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_ARCH_H_INCLUDED +#define I386_ARCH_H_INCLUDED + +#if !defined(__i386__) && !defined(__amd64__) +#error "Not supported in the current architecture." 
+#endif + +#include +#include +#include + +#include "common.h" + +#if defined(__AVX__) + +#include + +#elif defined(__SSE4_1__) + +#include + +#elif defined(__SSSE3__) + +#include + +#elif defined(__SSE2__) + +#include + +// Some AMD processors with ABM do not support SSSE3/SSE4.1. +#if defined(__POPCNT__) +#include +#endif + +#elif defined(__SSE__) + +#include + +#endif + +#ifdef ARCH_NATIVE + +// Do not use BMI2 PDEP/PEXT on AMD Zen 1/2. +#if defined(__BMI2__) && (defined(__znver1__) || defined(__znver2__)) +#define NO_PDEP_PEXT 1 +#endif + +// Enable AVX-512 on AMD Zen 5 only. +#if defined(__AVX512F__) && !defined(__znver5__) +#define NO_AVX512 1 +#endif + +#endif // ARCH_NATIVE + +namespace Stockfish { + +enum class PrefetchHint { + ET0 = 7, + T0 = 3, + T1 = 2, + T2 = 1, + NTA = 0 +}; + +// Register size is equal to address bits. +inline constexpr bool ArchImpl::Is64Bit = sizeof(void*) == 8; + +#if defined(__BMI2__) && !defined(NO_PDEP_PEXT) +inline constexpr bool ArchImpl::UsePEXT = true; +#else +inline constexpr bool ArchImpl::UsePEXT = false; +#endif + +template +inline void ArchImpl::prefetch([[maybe_unused]] const void* m) { +#ifdef __SSE__ + constexpr int __Hint = [] { + if constexpr (Hint == -1) + return 3; + else +#ifdef __PRFCHW__ + return (Hint & 0x4) ? 7 : (Hint & 0x3); +#else + return Hint & 0x3; +#endif + }(); + + // GCC doesn't comply with Intel Intrinsics Guide and uses enum instead + // of int. 
+#if STOCKFISH_COMPILER == STOCKFISH_COMPILER_GCC + _mm_prefetch(m, [] { + if constexpr (__Hint == 7) + return _MM_HINT_ET0; + else if constexpr (__Hint == 3) + return _MM_HINT_T0; + else if constexpr (__Hint == 2) + return _MM_HINT_T1; + else if constexpr (__Hint == 1) + return _MM_HINT_T2; + else + return _MM_HINT_NTA; + }()); +#else + _mm_prefetch(m, __Hint); +#endif + +#endif // __SSE__ +} + +template +inline unsigned int ArchImpl::popcount(T n) { + static_assert(std::is_integral_v && sizeof(T) <= 8); + +#ifdef __POPCNT__ + if constexpr (sizeof(T) == 8) + return _mm_popcnt_u64(std::uint64_t(n)); + else + return _mm_popcnt_u32(std::uint32_t(n)); +#else + if constexpr (!is_64bit() && sizeof(T) == 8) + return __popcount_use_table(n); + else + return __popcount_value(n); +#endif +} + +template +inline T ArchImpl::pext([[maybe_unused]] T n, [[maybe_unused]] T mask) { +#if defined(__BMI2__) && !defined(NO_PDEP_PEXT) + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + + if constexpr (sizeof(T) == 8) + return _pext_u64(std::uint64_t(n), std::uint64_t(mask)); + else + return _pext_u32(std::uint32_t(n), std::uint32_t(mask)); +#else + return 0; +#endif +} + +// =========================================================================== +// The functions below are used on i386/AMD64 targets only. 
+// =========================================================================== + +template +inline std::make_unsigned_t blsr(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + +#ifdef __BMI__ + if constexpr (sizeof(T) == 8) + return _blsr_u64(std::uint64_t(n)); + else + return _blsr_u32(std::uint32_t(n)); +#else + return std::make_unsigned_t(n) & std::make_unsigned_t(n - 1); +#endif +} + +template +inline int tzcnt(T n) { + static_assert(std::is_integral_v && (sizeof(T) == 4 || sizeof(T) == 8)); + +#ifdef __BMI__ + if constexpr (sizeof(T) == 8) + return _tzcnt_u64(std::uint64_t(n)); + else + return _tzcnt_u32(std::uint32_t(n)); +#else + assert(n != 0); + + if constexpr (sizeof(T) == 8) + return __builtin_ctzll(n); + else + return __builtin_ctz(n); +#endif +} + +#ifdef __SSE2__ + +template +struct is_valid_vector { + static constexpr bool value = sizeof(T) == 16 +#ifdef __AVX2__ + || sizeof(T) == 32 +#endif +#ifdef __AVX512F__ + || sizeof(T) == 64 +#endif + ; +}; + +template +inline constexpr bool is_valid_vector_v = is_valid_vector::value; + +template +inline T _mm_setzero_v() { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_setzero_si512(); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + return _mm256_setzero_si256(); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_setzero_si128(); +} + +template +inline T _mm_set1_epi16_v(std::uint16_t n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_set1_epi16(n); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + return _mm256_set1_epi16(n); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_set1_epi16(n); +} + +template +inline T _mm_set1_epi32_v(std::uint32_t n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_set1_epi32(n); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) 
== 32) + return _mm256_set1_epi32(n); +#endif + + if constexpr (sizeof(T) == 16) + return _mm_set1_epi32(n); +} + +template +inline T _mm_packus_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_packus_epi16(a, b); +#else + static_assert(false, "_mm_packus_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_packus_epi16(a, b); +#else + static_assert(false, "_mm_packus_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_packus_epi16(a, b); +} + +template +inline T _mm_add_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_add_epi16(a, b); +#else + static_assert(false, "_mm_add_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_add_epi16(a, b); +#else + static_assert(false, "_mm_add_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_add_epi16(a, b); +} + +template +inline T _mm_add_epi32_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_add_epi32(a, b); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_add_epi32(a, b); +#else + static_assert(false, "_mm_add_epi32_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_add_epi32(a, b); +} + +template +inline T _mm_sub_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_sub_epi16(a, b); +#else + static_assert(false, "_mm_sub_epi16_v<__m512i> is not 
allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_sub_epi16(a, b); +#else + static_assert(false, "_mm_sub_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_sub_epi16(a, b); +} + +template +inline T _mm_sub_epi32_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_sub_epi32(a, b); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_sub_epi32(a, b); +#else + static_assert(false, "_mm_sub_epi32_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_sub_epi32(a, b); +} + +template +inline T _mm_mulhi_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_mulhi_epi16(a, b); +#else + static_assert(false, "vmulhi_16<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_mulhi_epi16(a, b); +#else + static_assert(false, "vmulhi_16<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_mulhi_epi16(a, b); +} + +template +inline T _mm_slli_epi16_v(T a, int n) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_slli_epi16(a, n); +#else + static_assert(false, "_mm_slli_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_slli_epi16(a, n); +#else + static_assert(false, "_mm_slli_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_slli_epi16(a, n); +} + +template +inline T _mm_max_epi16_v(T a, T b) { + 
static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_max_epi16(a, b); +#else + static_assert(false, "_mm_max_epi16_v<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_max_epi16(a, b); +#else + static_assert(false, "_mm_max_epi16_v<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_max_epi16(a, b); +} + +template +inline T _mm_min_epi16_v(T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) +#ifdef __AVX512BW__ + return _mm512_min_epi16(a, b); +#else + static_assert(false, "vmin_16<__m512i> is not allowed without AVX-512 BW."); +#endif +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) +#ifdef __AVX2__ + return _mm256_min_epi16(a, b); +#else + static_assert(false, "vmin_16<__m256i> is not allowed without AVX2."); +#endif +#endif + + if constexpr (sizeof(T) == 16) + return _mm_min_epi16(a, b); +} + +template +inline std::int32_t _mm_reduce_add_epi32_v(T a) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + return _mm512_reduce_add_epi32(a); +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + { + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_BADC)); + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum); + } +#endif + + if constexpr (sizeof(T) == 16) + { + a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0x4E)); // _MM_PERM_BADC + a = _mm_add_epi32(a, _mm_shuffle_epi32(a, 0xB1)); // _MM_PERM_CDAB + return _mm_cvtsi128_si32(a); + } +} + +// Non-VNNI implementation of dpbusd works even with type saturation, only +// because output values are clamped in ReLU layers immediately after +// AffineTransform 
layer. Do not use this without VNNI for general purpose. +template +inline void _mm_dpbusd_epi32_v(T& acc, T a, T b) { + static_assert(is_valid_vector_v); + +#ifdef __AVX512F__ + if constexpr (sizeof(T) == 64) + { +#if defined(__AVX512VNNI__) + + acc = _mm512_dpbusd_epi32(acc, a, b); + +#elif defined(__AVX512BW__) + + __m512i product = _mm512_maddubs_epi16(a, b); + product = _mm512_madd_epi16(product, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, product); + +#else + static_assert(false, "_mm_dpbusd_epi32_v<__m512i> is not allowed without AVX-512 BW."); +#endif + } +#endif + +#ifdef __AVX__ + if constexpr (sizeof(T) == 32) + { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + + acc = _mm256_dpbusd_epi32(acc, a, b); + +#elif defined(__AVX2__) + + __m256i product = _mm256_madd_epi16(_mm256_maddubs_epi16(a, b), _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, product); + +#else + static_assert(false, "_mm_dpbusd_epi32_v<__m256i> is not allowed without AVX2."); +#endif + } +#endif + + if constexpr (sizeof(T) == 16) + { +#if (defined(__AVX512VL__) && defined(__AVX512VNNI__)) || defined(__AVXVNNI__) + + acc = _mm_dpbusd_epi32(acc, a, b); + +#elif defined(__SSSE3__) + + __m128i product = _mm_madd_epi16(_mm_maddubs_epi16(a, b), _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product); + +#else + + __m128i a0 = _mm_unpacklo_epi8(a, _mm_setzero_si128()); + __m128i a1 = _mm_unpackhi_epi8(a, _mm_setzero_si128()); + __m128i sgn = _mm_cmplt_epi8(b, _mm_setzero_si128()); + __m128i b0 = _mm_unpacklo_epi8(b, sgn); + __m128i b1 = _mm_unpackhi_epi8(b, sgn); + __m128i product0 = _mm_madd_epi16(a0, b0); + __m128i product1 = _mm_madd_epi16(a1, b1); + __m128i product = _mm_madd_epi16(_mm_packs_epi32(product0, product1), _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product); + +#endif + } +} + +#endif // __SSE2__ + +} // namespace Stockfish + +#endif // I386_ARCH_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/affine_transform.h 
b/src/arch/i386/nnue/layers/affine_transform.h new file mode 100644 index 00000000000..9155089a16a --- /dev/null +++ b/src/arch/i386/nnue/layers/affine_transform.h @@ -0,0 +1,118 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/affine_transform.h" + +#else + +#include "../../arch.h" + +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +constexpr IndexType AffineTransform::get_weight_index(IndexType i) { + return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + i % 4; +} + +template +void AffineTransform::propagate(const InputType* input, OutputType* output) const { + if constexpr (OutputDimensions > 1) + { +#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \ + && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType); + static_assert(OutputDimensions % OutputLanes == 0); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + vec_t acc[NumRegs]; + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType i = 0; i < NumChunks; ++i) + { + const vec_t in = + _mm_set1_epi32_v(reinterpret_cast(input)[i]); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + _mm_dpbusd_epi32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; + } + else if constexpr (OutputDimensions == 1) + { + // We cannot use AVX512 for the last layer because there are only + // 32 inputs and the buffer is not padded to 64 elements. 
+#if defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType InputLanes = sizeof(vec_t) / sizeof(InputType); + static_assert(PaddedInputDimensions % InputLanes == 0); + + static constexpr IndexType NumChunks = PaddedInputDimensions / InputLanes; + + vec_t sum = _mm_setzero_v(); + + for (IndexType j = 0; j < NumChunks; ++j) + { + const vec_t in = reinterpret_cast(input)[j]; + const vec_t row = reinterpret_cast(weights)[j]; + _mm_dpbusd_epi32_v(sum, in, row); + } + + output[0] = _mm_reduce_add_epi32_v(sum) + biases[0]; + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/affine_transform_sparse_input.h b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h new file mode 100644 index 00000000000..dbc20f704ad --- /dev/null +++ b/src/arch/i386/nnue/layers/affine_transform_sparse_input.h @@ -0,0 +1,168 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED +#define I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED + +#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +#error "Never use architecture specific header files directly." 
+#endif + +#include "../../arch.h" + +#include +#include +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +#ifdef __SSSE3__ + +alignas(CacheLineSize) static const std::array, 256> lookupIndices = + [] { + std::array, 256> array{}; + for (std::uint64_t i = 0; i < 256; ++i) + { + std::uint64_t j = i, k = 0; + while (j) + array[i][k++] = tzcnt(j), j = blsr(j); + } + return array; + }(); + +template +class AffineTransformSparseInput: public AffineTransform { + __DEFINE_BASE_PROPERTIES + + static_assert(OutputDimensions % 16 == 0, + "OutputDimensions must be multiple of 16 for this layer."); + + public: + void propagate(const InputType* input, OutputType* output) const; + + private: + template + static IndexType populate_nz_indices(const std::uint8_t* input, std::uint16_t* indices) { +#if defined(__AVX512F__) && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType InputLanes = sizeof(vec_t) / 4; + + // Inputs are processed InputLanes at a time and outputs are processed + // 8 at a time so we process in chunks of max(InputLanes, 8). + static constexpr IndexType ChunkSize = std::max(InputLanes, 8); + static constexpr IndexType NumChunks = InputDimensions / ChunkSize; + static constexpr IndexType InputsPerChunk = ChunkSize / InputLanes; + static constexpr IndexType OutputsPerChunk = ChunkSize / 8; + + IndexType count = 0; + __m128i base = _mm_setzero_si128(); + + for (IndexType i = 0; i < NumChunks; ++i) + { + std::uint32_t nnz = 0; + for (IndexType j = 0; j < InputsPerChunk; ++j) + { + const vec_t chunk = reinterpret_cast(input)[i * InputsPerChunk + j]; + + // Since all 32-bit blocks are non-negative, it is safe to use cmpgt + // if the target architecture does not support cmpneq. 
+#if defined(__AVX512F__) && !defined(NO_AVX512) + const std::uint32_t mask = _mm512_cmpneq_epi32_mask(chunk, _mm512_setzero_si512()); +#elif defined(__AVX2__) + const std::uint32_t mask = _mm256_movemask_ps( + _mm256_castsi256_ps(_mm256_cmpgt_epi32(chunk, _mm256_setzero_si256()))); +#elif defined(__AVX__) + const std::uint32_t mask = _mm256_movemask_ps( + _mm256_cmp_ps(_mm256_castsi256_ps(chunk), _mm256_setzero_ps(), _CMP_NEQ_UQ)); +#else + const std::uint32_t mask = + _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(chunk, _mm_setzero_si128()))); +#endif + + nnz |= mask << (j * InputLanes); + } + for (IndexType j = 0; j < OutputsPerChunk; ++j) + { + const std::uint8_t lookup = (nnz >> (j * 8)) & 0xFF; + const __m128i offsets = *reinterpret_cast(&lookupIndices[lookup]); + _mm_storeu_si128(reinterpret_cast<__m128i_u*>(indices + count), + _mm_add_epi16(base, offsets)); + count += popcount(lookup); + base = _mm_add_epi16(base, _mm_set1_epi16(8)); + } + } + + return count; + } +}; + +template +void AffineTransformSparseInput::propagate(const InputType* input, + OutputType* output) const { +#if defined(__AVX512F__) && (defined(__AVX512BW__) || defined(__AVX512VNNI__)) \ + && !defined(NO_AVX512) + using vec_t = __m512i; +#elif defined(__AVX2__) + using vec_t = __m256i; +#else + using vec_t = __m128i; +#endif + + static constexpr IndexType OutputLanes = sizeof(vec_t) / sizeof(OutputType); + + static constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; + static constexpr IndexType NumRegs = OutputDimensions / OutputLanes; + + vec_t acc[NumRegs]; + std::uint16_t nnz[NumChunks]; + IndexType count = populate_nz_indices(input, nnz); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = reinterpret_cast(biases)[k]; + + for (IndexType j = 0; j < count; ++j) + { + const auto i = nnz[j]; + const vec_t in = _mm_set1_epi32_v(reinterpret_cast(input)[i]); + const auto col = reinterpret_cast(&weights[i * OutputDimensions * 4]); + for (std::size_t k = 0; 
k < array_size(acc); ++k) + _mm_dpbusd_epi32_v(acc[k], in, col[k]); + } + + for (std::size_t k = 0; k < array_size(acc); ++k) + reinterpret_cast(output)[k] = acc[k]; +} + +#else + +template +using AffineTransformSparseInput = AffineTransform; + +#endif // __SSSE3__ + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // I386_NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/clipped_relu.h b/src/arch/i386/nnue/layers/clipped_relu.h new file mode 100644 index 00000000000..187d77554f0 --- /dev/null +++ b/src/arch/i386/nnue/layers/clipped_relu.h @@ -0,0 +1,133 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#define I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void ClippedReLU::propagate(const InputType* input, OutputType* output) const { + static_assert(PaddedOutputDimensions % 32 == 0); + + // Do not use 256-bit registers on AVX as it does not have shift + // instructions, instead fall back to SSE4.1. +#ifdef __AVX2__ + +#if defined(__AVX512F__) && defined(__AVX512BW__) && !defined(NO_AVX512) + if constexpr (PaddedOutputDimensions >= 64) + { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 64; + + static const __m512i permuteTable512 = + _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m512i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { + const __m512i words0 = + _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m512i words1 = + _mm512_srli_epi16(_mm512_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = _mm512_permutexvar_epi32(permuteTable512, _mm512_packs_epi16(words0, words1)); + } + } + constexpr IndexType Start = PaddedOutputDimensions / 64 * 64; +#else + constexpr IndexType Start = 0; +#endif + + if constexpr (Start != PaddedOutputDimensions) + { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 32; + + static const __m256i permuteTable256 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m256i*>(output); + + for (IndexType i = Start / 32; i < NumChunks; ++i) + { + const __m256i words0 = + _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m256i words1 = + _mm256_srli_epi16(_mm256_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = + 
_mm256_permutevar8x32_epi32(_mm256_packs_epi16(words0, words1), permuteTable256); + } + } + +#else // __SSE2__ + + static constexpr IndexType NumChunks = ceil_to_multiple(OutputDimensions, 16) / 16; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { +#ifdef __SSE4_1__ + const __m128i words0 = + _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m128i words1 = + _mm_srli_epi16(_mm_packus_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + + out[i] = _mm_packs_epi16(words0, words1); +#else + static const __m128i s8min = _mm_set1_epi8(-0x80); + + const __m128i words0 = + _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]), WeightScaleBits); + const __m128i words1 = + _mm_srai_epi16(_mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]), WeightScaleBits); + const __m128i bytes = _mm_packs_epi16(words0, words1); + + out[i] = _mm_subs_epi8(_mm_adds_epi8(bytes, s8min), s8min); +#endif + } + +#endif +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/i386/nnue/layers/sqr_clipped_relu.h b/src/arch/i386/nnue/layers/sqr_clipped_relu.h new file mode 100644 index 00000000000..bdab25693d0 --- /dev/null +++ b/src/arch/i386/nnue/layers/sqr_clipped_relu.h @@ -0,0 +1,68 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#define I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED + +#ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. +#ifndef __SSE2__ + +#include "arch/generic/nnue/layers/sqr_clipped_relu.h" + +#else + +#include "../../arch.h" + +#include + +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE::Layers { + +template +void SqrClippedReLU::propagate(const InputType* input, OutputType* output) const { + static constexpr IndexType NumChunks = PaddedOutputDimensions / 16; + + const auto in = reinterpret_cast(input); + const auto out = reinterpret_cast<__m128i*>(output); + + for (IndexType i = 0; i < NumChunks; ++i) + { + __m128i words0 = _mm_packs_epi32(in[i * 4 + 0], in[i * 4 + 1]); + __m128i words1 = _mm_packs_epi32(in[i * 4 + 2], in[i * 4 + 3]); + + // We shift by WeightScaleBits * 2 = 12 and divide by 128 which is + // an additional shift-right of 7, meaning 19 in total. Mulhi + // strips the lower 16 bits so we need to shift by 3 more. 
+ words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3); + words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3); + + out[i] = _mm_packs_epi16(words0, words1); + } +} + +} // namespace Stockfish::Eval::NNUE::Layers + +#endif // !__SSE2__ + +#endif // I386_NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/arch/i386/nnue/nnue_feature_transformer.h b/src/arch/i386/nnue/nnue_feature_transformer.h new file mode 100644 index 00000000000..a6d321f465c --- /dev/null +++ b/src/arch/i386/nnue/nnue_feature_transformer.h @@ -0,0 +1,435 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#define I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED + +#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#error "Never use architecture specific header files directly." +#endif + +// Check x86/AMD64 SIMD extensions. +// If none is defined, fall back to the generic implementation. 
+#ifndef __SSE2__ + +#include "arch/generic/nnue/nnue_feature_transformer.h" + +#else + +#include "../arch.h" + +#include +#include + +#include "misc.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_common.h" + +namespace Stockfish::Eval::NNUE { + +template StateInfo::*accPtr> +struct FeatureTransformer::Details { +#if defined(__AVX512F__) && defined(__AVX512BW__) && !defined(NO_AVX512) + // The size of the current PSQT weights array is too small for AVX-512. + using vec_t = __m512i; + using psqt_vec_t = __m256i; +#elif defined(__AVX2__) + using vec_t = __m256i; + using psqt_vec_t = __m256i; +#else + using vec_t = __m128i; + using psqt_vec_t = __m128i; +#endif + + private: +#if defined(__AVX512F__) + // EVEX encoding scheme, but uses 16 only. Need to check <=32 + static constexpr int NumXMM = 16; +#else + static constexpr int NumXMM = is_64bit() ? 16 : 8; +#endif + + public: + static constexpr std::size_t AccRegisterSize = sizeof(vec_t); + static constexpr std::size_t PSQTRegisterSize = sizeof(psqt_vec_t); + + static constexpr int OptimalAccRegisterCount = + optimal_register_count(); + static constexpr int OptimalPSQTRegisterCount = + optimal_register_count(); + + static constexpr IndexType TileHeight = + OptimalAccRegisterCount * AccRegisterSize / sizeof(WeightType); + static constexpr IndexType PsqtTileHeight = + OptimalPSQTRegisterCount * PSQTRegisterSize / sizeof(PSQTWeightType); + + static_assert(HalfDimensions % TileHeight == 0, + "HalfDimensions must be multiple of TileHeight"); + static_assert(PSQTBuckets % PsqtTileHeight == 0, + "PSQTBuckets must be multiple of PsqtTileHeight"); +}; + +template +static inline constexpr void permute_pack(std::uint64_t* v) { + if constexpr (RegisterSize == 64) + if constexpr (Write) + { + std::uint64_t tmp0 = v[2], tmp1 = v[3]; + v[2] = v[8], v[3] = v[9]; + v[8] = v[4], v[9] = v[5]; + v[4] = tmp0, v[5] = tmp1; + tmp0 = v[6], tmp1 = v[7]; + v[6] = v[10], v[7] = v[11]; + 
v[10] = v[12], v[11] = v[13]; + v[12] = tmp0, v[13] = tmp1; + } + else + { + std::uint64_t tmp0 = v[2], tmp1 = v[3]; + v[2] = v[4], v[3] = v[5]; + v[4] = v[8], v[5] = v[9]; + v[8] = tmp0, v[9] = tmp1; + tmp0 = v[6], tmp1 = v[7]; + v[6] = v[12], v[7] = v[13]; + v[12] = v[10], v[13] = v[11]; + v[10] = tmp0, v[11] = tmp1; + } + else if constexpr (RegisterSize == 32) + { + std::swap(v[2], v[4]); + std::swap(v[3], v[5]); + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer::permute_weights() { + // The weight numbers are permuted preliminarily, due to the use of + // AVX2/AVX-512 pack intrinsics. + if constexpr (Details::AccRegisterSize >= 32) + { + constexpr IndexType Width = Details::AccRegisterSize == 64 ? 16 : 8; + + for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / 8; i += Width) + permute_pack( + &reinterpret_cast(biases)[i]); + + for (IndexType j = 0; j < InputDimensions; ++j) + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / 8; i += Width) + permute_pack( + &reinterpret_cast(&weights[j * HalfDimensions])[i]); + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_incremental(StateInfo* computed, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const { + using vec_t = typename Details::vec_t; + using psqt_vec_t = typename Details::psqt_vec_t; + + // The most common case when updating the accumulator incrementally. + // Calculates feature differences directly without using tiling mechanism. 
+ if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) + { + const auto accIn = + reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); + const auto accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); + + const IndexType offsetR0 = HalfDimensions * removed[0]; + const auto columnR0 = reinterpret_cast(&weights[offsetR0]); + const IndexType offsetA = HalfDimensions * added[0]; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = _mm_add_epi16_v(_mm_sub_epi16_v(accIn[i], columnR0[i]), columnA[i]); + } + else + { + const IndexType offsetR1 = HalfDimensions * removed[1]; + const auto columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) + accOut[i] = _mm_sub_epi16_v(_mm_add_epi16_v(accIn[i], columnA[i]), + _mm_add_epi16_v(columnR0[i], columnR1[i])); + } + + const auto accPsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][0]); + const auto accPsqtOut = + reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); + + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; + auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); + const IndexType offsetPsqtA = PSQTBuckets * added[0]; + auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); + + if (removed.size() == 1) + { + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); + ++i) + accPsqtOut[i] = + _mm_add_epi32_v(_mm_sub_epi32_v(accPsqtIn[i], columnPsqtR0[i]), columnPsqtA[i]); + } + else + { + const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; + const auto columnPsqtR1 = + reinterpret_cast(&psqtWeights[offsetPsqtR1]); + + for (IndexType i = 0; i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); + ++i) + accPsqtOut[i] = 
_mm_sub_epi32_v(_mm_add_epi32_v(accPsqtIn[i], columnPsqtA[i]), + _mm_add_epi32_v(columnPsqtR0[i], columnPsqtR1[i])); + } + } + else + { + vec_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType i = 0; i < HalfDimensions / Details::TileHeight; ++i) + { + const IndexType offsetRow = i * Details::TileHeight; + + const auto accTileIn = reinterpret_cast( + &(computed->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = accTileIn[j]; + + for (const auto index : removed) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = _mm_sub_epi16_v(acc[j], column[j]); + } + + for (const auto index : added) + { + const IndexType offset = HalfDimensions * index + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + for (std::size_t j = 0; j < array_size(acc); ++j) + acc[j] = _mm_add_epi16_v(acc[j], column[j]); + } + + const auto accTileOut = + reinterpret_cast(&(next->*accPtr).accumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(acc); ++j) + accTileOut[j] = acc[j]; + } + + psqt_vec_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType i = 0; i < PSQTBuckets / Details::PsqtTileHeight; ++i) + { + const IndexType offsetRow = i * Details::PsqtTileHeight; + + auto accTilePsqtIn = reinterpret_cast( + &(computed->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = accTilePsqtIn[j]; + + for (const auto index : removed) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = _mm_sub_epi32_v(psqt[j], columnPsqt[j]); + } + + for (const auto index : added) + { + const IndexType offset = PSQTBuckets * index + offsetRow; + auto columnPsqt = 
reinterpret_cast(&psqtWeights[offset]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + psqt[j] = _mm_add_epi32_v(psqt[j], columnPsqt[j]); + } + + auto accTilePsqtOut = reinterpret_cast( + &(next->*accPtr).psqtAccumulation[Perspective][offsetRow]); + for (std::size_t j = 0; j < array_size(psqt); ++j) + accTilePsqtOut[j] = psqt[j]; + } + } +} + +template StateInfo::*accPtr> +template +void FeatureTransformer:: + apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const { + using vec_t = typename Details::vec_t; + using psqt_vec_t = typename Details::psqt_vec_t; + + vec_t acc[Details::OptimalAccRegisterCount]; + + for (IndexType j = 0; j < HalfDimensions / Details::TileHeight; ++j) + { + const IndexType offsetRow = j * Details::TileHeight; + + const auto accTile = + reinterpret_cast(&accumulator.accumulation[Perspective][offsetRow]); + const auto entryTile = reinterpret_cast(&entry.accumulation[offsetRow]); + + for (IndexType k = 0; k < array_size(acc); ++k) + acc[k] = entryTile[k]; + + std::size_t i = 0; + for (; i < std::min(removed.size(), added.size()); ++i) + { + const IndexType offsetR = HalfDimensions * removed[i] + offsetRow; + const auto columnR = reinterpret_cast(&weights[offsetR]); + const IndexType offsetA = HalfDimensions * added[i] + offsetRow; + const auto columnA = reinterpret_cast(&weights[offsetA]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_add_epi16_v(acc[k], _mm_sub_epi16_v(columnA[k], columnR[k])); + } + for (; i < removed.size(); ++i) + { + const IndexType offset = HalfDimensions * removed[i] + offsetRow; + const auto column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_sub_epi16_v(acc[k], column[k]); + } + for (; i < added.size(); ++i) + { + const IndexType offset = HalfDimensions * added[i] + offsetRow; + const auto 
column = reinterpret_cast(&weights[offset]); + + for (std::size_t k = 0; k < array_size(acc); ++k) + acc[k] = _mm_add_epi16_v(acc[k], column[k]); + } + + for (std::size_t k = 0; k < array_size(acc); k++) + entryTile[k] = acc[k]; + for (std::size_t k = 0; k < array_size(acc); k++) + accTile[k] = acc[k]; + } + + psqt_vec_t psqt[Details::OptimalPSQTRegisterCount]; + + for (IndexType j = 0; j < PSQTBuckets / Details::PsqtTileHeight; ++j) + { + const IndexType offsetRow = j * Details::PsqtTileHeight; + + const auto accTilePsqt = + reinterpret_cast(&accumulator.psqtAccumulation[Perspective][offsetRow]); + const auto entryTilePsqt = + reinterpret_cast(&entry.psqtAccumulation[offsetRow]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = entryTilePsqt[k]; + + for (std::size_t i = 0; i < removed.size(); ++i) + { + const IndexType offset = PSQTBuckets * removed[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = _mm_sub_epi32_v(psqt[k], columnPsqt[k]); + } + for (std::size_t i = 0; i < added.size(); ++i) + { + const IndexType offset = PSQTBuckets * added[i] + offsetRow; + const auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < array_size(psqt); ++k) + psqt[k] = _mm_add_epi32_v(psqt[k], columnPsqt[k]); + } + + for (std::size_t k = 0; k < array_size(psqt); ++k) + entryTilePsqt[k] = psqt[k]; + for (std::size_t k = 0; k < array_size(psqt); ++k) + accTilePsqt[k] = psqt[k]; + } +} + +template StateInfo::*accPtr> +void FeatureTransformer::convert_accumulators( + const Position& pos, OutputType* output) const { + using vec_t = typename Details::vec_t; + + static constexpr IndexType OutputChunkSize = Details::AccRegisterSize / sizeof(OutputType); + static_assert((HalfDimensions / 2) % OutputChunkSize == 0); + + static constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; + + const Color perspectives[2] = 
{pos.side_to_move(), ~pos.side_to_move()}; + const auto& accumulation = (pos.state()->*accPtr).accumulation; + + for (IndexType p = 0; p < 2; ++p) + { + const auto in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); + const auto in1 = + reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); + const auto out = reinterpret_cast(&output[(HalfDimensions / 2) * p]); + + for (IndexType j = 0; j < NumOutputChunks; ++j) + { + // What we want to do is multiply inputs in a pairwise manner + // (after clipping), and then shift right by 9. Instead, we + // shift left by 7, and use mulhi, stripping the bottom 16 bits, + // effectively shifting right by 16, resulting in a net shift + // of 9 bits. We use mulhi because it maintains the sign of + // the multiplication (unlike mullo), allowing us to make use + // of packus to clip 2 of the inputs, resulting in a save of 2 + // "_mm_max_epi16_v" calls. + + static const vec_t Zeroes = _mm_setzero_v(); + static const vec_t Ones = _mm_set1_epi16_v(127 * 2); + + const vec_t sum0a = + _mm_slli_epi16_v(_mm_max_epi16_v(_mm_min_epi16_v(in0[j * 2 + 0], Ones), Zeroes), 7); + const vec_t sum0b = + _mm_slli_epi16_v(_mm_max_epi16_v(_mm_min_epi16_v(in0[j * 2 + 1], Ones), Zeroes), 7); + const vec_t sum1a = _mm_min_epi16_v(in1[j * 2 + 0], Ones); + const vec_t sum1b = _mm_min_epi16_v(in1[j * 2 + 1], Ones); + + const vec_t pa = _mm_mulhi_epi16_v(sum0a, sum1a); + const vec_t pb = _mm_mulhi_epi16_v(sum0b, sum1b); + + out[j] = _mm_packus_epi16_v(pa, pb); + } + } +} + +} // namespace Stockfish::Eval::NNUE + +#endif // !__SSE2__ + +#endif // I386_NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/bitboard.cpp b/src/bitboard.cpp index a8b4e5f4464..5545e5133e1 100644 --- a/src/bitboard.cpp +++ b/src/bitboard.cpp @@ -16,17 +16,16 @@ along with this program. If not, see . 
*/ -#include "bitboard.h" #include #include #include +#include "bitboard.h" #include "misc.h" namespace Stockfish { -uint8_t PopCnt16[1 << 16]; uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; Bitboard LineBB[SQUARE_NB][SQUARE_NB]; @@ -50,34 +49,13 @@ Bitboard safe_destination(Square s, int step) { Square to = Square(s + step); return is_ok(to) && distance(s, to) <= 2 ? square_bb(to) : Bitboard(0); } -} - -// Returns an ASCII representation of a bitboard suitable -// to be printed to standard output. Useful for debugging. -std::string Bitboards::pretty(Bitboard b) { - - std::string s = "+---+---+---+---+---+---+---+---+\n"; - - for (Rank r = RANK_8; r >= RANK_1; --r) - { - for (File f = FILE_A; f <= FILE_H; ++f) - s += b & make_square(f, r) ? "| X " : "| "; - - s += "| " + std::to_string(1 + r) + "\n+---+---+---+---+---+---+---+---+\n"; - } - s += " a b c d e f g h\n"; - - return s; -} +} // namespace // Initializes various bitboard tables. It is called at // startup and relies on global objects to be already zero-initialized. void Bitboards::init() { - for (unsigned i = 0; i < (1 << 16); ++i) - PopCnt16[i] = uint8_t(std::bitset<16>(i).count()); - for (Square s1 = SQ_A1; s1 <= SQ_H8; ++s1) for (Square s2 = SQ_A1; s2 <= SQ_H8; ++s2) SquareDistance[s1][s2] = std::max(distance(s1, s2), distance(s1, s2)); @@ -163,7 +141,7 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { // apply to the 64 or 32 bits word to get the index. Magic& m = magics[s]; m.mask = sliding_attack(pt, s, 0) & ~edges; - m.shift = (Is64Bit ? 64 : 32) - popcount(m.mask); + m.shift = (is_64bit() ? 64 : 32) - popcount(m.mask); // Set the offset for the attacks table of the square. We have individual // table sizes for each square with "Fancy Magic Bitboards". 
@@ -177,17 +155,17 @@ void init_magics(PieceType pt, Bitboard table[], Magic magics[]) { occupancy[size] = b; reference[size] = sliding_attack(pt, s, b); - if (HasPext) + if constexpr (use_pext()) m.attacks[pext(b, m.mask)] = reference[size]; size++; b = (b - m.mask) & m.mask; } while (b); - if (HasPext) + if constexpr (use_pext()) continue; - PRNG rng(seeds[Is64Bit][rank_of(s)]); + PRNG rng(seeds[is_64bit()][rank_of(s)]); // Find a magic for square 's' picking up an (almost) random number // until we find the one that passes the verification test. diff --git a/src/bitboard.h b/src/bitboard.h index cdff4c759bc..d75ec420d85 100644 --- a/src/bitboard.h +++ b/src/bitboard.h @@ -26,14 +26,14 @@ #include #include +#include "common.h" #include "types.h" namespace Stockfish { namespace Bitboards { -void init(); -std::string pretty(Bitboard b); +void init(); } // namespace Stockfish::Bitboards @@ -55,7 +55,6 @@ constexpr Bitboard Rank6BB = Rank1BB << (8 * 5); constexpr Bitboard Rank7BB = Rank1BB << (8 * 6); constexpr Bitboard Rank8BB = Rank1BB << (8 * 7); -extern uint8_t PopCnt16[1 << 16]; extern uint8_t SquareDistance[SQUARE_NB][SQUARE_NB]; extern Bitboard BetweenBB[SQUARE_NB][SQUARE_NB]; @@ -73,11 +72,10 @@ struct Magic { // Compute the attack's index using the 'magic bitboards' approach unsigned index(Bitboard occupied) const { - - if (HasPext) + if constexpr (use_pext()) return unsigned(pext(occupied, mask)); - if (Is64Bit) + if constexpr (is_64bit()) return unsigned(((occupied & mask) * magic) >> shift); unsigned lo = unsigned(occupied) & unsigned(mask); @@ -259,29 +257,6 @@ inline Bitboard attacks_bb(PieceType pt, Square s, Bitboard occupied) { } } - -// Counts the number of non-zero bits in a bitboard. 
-inline int popcount(Bitboard b) { - -#ifndef USE_POPCNT - - union { - Bitboard bb; - uint16_t u[4]; - } v = {b}; - return PopCnt16[v.u[0]] + PopCnt16[v.u[1]] + PopCnt16[v.u[2]] + PopCnt16[v.u[3]]; - -#elif defined(_MSC_VER) - - return int(_mm_popcnt_u64(b)); - -#else // Assumed gcc or compatible compiler - - return __builtin_popcountll(b); - -#endif -} - // Returns the least significant bit in a non-zero bitboard. inline Square lsb(Bitboard b) { assert(b); diff --git a/src/common.h b/src/common.h new file mode 100644 index 00000000000..eaf5c2ed551 --- /dev/null +++ b/src/common.h @@ -0,0 +1,205 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef COMMON_H_INCLUDED +#define COMMON_H_INCLUDED + +#include +#include +#include +#include +#include +#include + +// When compiling with provided Makefile (e.g. for Linux and OSX), +// configuration is done automatically. To get started, type 'make help'. +// +// When Makefile is not used (e.g. with Microsoft Visual Studio), some macros +// need to be pre-defined manually: +// +// NDEBUG Disable debugging mode. Always use this for release. +// +// __SSE__ Generate x86 prefetch instruction. +// __SSE2__ Generate x86 SSE2 SIMD instructions. +// __SSSE3__ Generate x86 SSSE3 SIMD instructions. 
+// __SSE4_1__ Generate x86 SSE4.1 SIMD instructions. +// __POPCNT__ Generate x86 POPCNT instruction. +// __PRFCHW__ Generate x86 PREFETCHW instruction. (not used currently) +// __AVX__ Generate x86 AVX SIMD instructions. +// __BMI__ Generate x86 BLSR and TZCNT instructions. +// __AVX2__ Generate x86 AVX2 SIMD instructions. +// __BMI2__ Generate x86 PEXT instruction. +// __AVX512F__ Generate x86 AVX-512 SIMD instructions. +// __AVX512BW__ ... +// __AVX512VL__ ... +// __AVX512VNNI__ ... + +#define STOCKFISH_COMPILER_UNKNOWN 0 +#define STOCKFISH_COMPILER_GCC 1 +#define STOCKFISH_COMPILER_CLANG 2 +#define STOCKFISH_COMPILER_INTEL 3 +#define STOCKFISH_COMPILER_MSVC 4 + +#if defined(__GNUC__) + #if defined(__INTEL_LLVM_COMPILER) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_INTEL + #elif defined(__clang__) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_CLANG + #else + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_GCC + #endif +#elif defined(_MSC_VER) + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_MSVC +#else + #define STOCKFISH_COMPILER STOCKFISH_COMPILER_UNKNOWN +#endif + +#if STOCKFISH_COMPILER == STOCKFISH_COMPILER_GCC + #if __GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2) + #define ALIGNAS_ON_STACK_VARIABLES_BROKEN + #endif +#elif STOCKFISH_COMPILER == STOCKFISH_COMPILER_MSVC + #pragma warning(disable: 4127) // Conditional expression is constant + #pragma warning(disable: 4146) // Unary minus operator applied to unsigned type + #pragma warning(disable: 4800) // Forcing value to bool 'true' or 'false' + + #if defined(_WIN64) + #include + #endif +#endif + +#define ASSERT_ALIGNED(ptr, alignment) \ + assert(reinterpret_cast(ptr) % alignment == 0) + +namespace Stockfish { + +template +constexpr std::size_t array_size(T (&)[N]) { + return N; +} + +// Instead of using raw integer values, give each hint a comprehensive name. +// Default is always -1, however, the actual value of it is defined in +// implementation detail. 
+enum class PrefetchHint; + +// This struct is used to provide generalized functionalities that might have +// different implementations depending on the target architecture. Each +// function defined in this struct is specialized in arch.h file respectively. +struct ArchImpl { + /// Does the struct name fit to its purpose? + + static const bool Is64Bit; + + static const bool UsePEXT; // used in Bitboard + + // Clang apparently does not follow SFIANE in if constexpr statements, + // therefore annotate arguments with maybe_unused attribute to avoid + // warnings. + + template + static inline void prefetch(const void* m); + + template + static inline unsigned int popcount(T n); + + template + static inline T pext(T n, T mask); +}; + +constexpr bool is_64bit() { return ArchImpl::Is64Bit; } + +constexpr bool use_pext() { return ArchImpl::UsePEXT; } + +// 64-bit builtin popcount is sometimes slower than using table, especially on +// 32-bit environment. Therefore we provide two versions of it and leave it +// for each platform to decide which one to use. 
+template +inline int __popcount_use_table(T n) { + static_assert(std::is_integral_v && sizeof(T) % 2 == 0); + + static const std::array popcntTable = [] { + std::array table; + for (int i = 0; i < 1 << 16; ++i) + table[i] = std::uint8_t(std::bitset<16>(i).count()); + return table; + }(); + + union { + T raw; + std::uint16_t words[sizeof(T) / 2]; + } v = {n}; + + int count = 0; + for (std::size_t i = 0; i < sizeof(T) / 2; ++i) + count += popcntTable[v.words[i]]; + + return count; +} + +template +inline int __popcount_value(T n) { +#ifdef __GNUC__ + if constexpr (sizeof(T) == 8) + return __builtin_popcountll(std::uint64_t(n)); + else + return __builtin_popcount(std::uint32_t(n)); +#else + if constexpr (sizeof(T) == 8) + return __popcount_value(std::uint32_t(n)) + __popcount_value(std::uint32_t(n >> 32)); + else + { + n = n - ((n >> 1) & 0x55555555); + n = (n & 0x33333333) + ((n >> 2) & 0x33333333); + return (((n + (n >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + } +#endif +} + +template(-1)> +inline void prefetch(const void* m) { + return ArchImpl::prefetch(Hint)>(m); +} + +template +inline unsigned int popcount(T n) { + return ArchImpl::popcount(n); +} + +template +inline T pext(T n, T mask) { + return ArchImpl::pext(n, mask); +} + +} // namespace Stockfish + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/arch.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/arch.h" + +#else + + #include "arch/generic/arch.h" + +#endif + +#endif // COMMON_H_INCLUDED diff --git a/src/engine.cpp b/src/engine.cpp index b5cc3f832f5..04d0be51b3a 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -28,6 +28,7 @@ #include #include +#include "common.h" #include "evaluate.h" #include "misc.h" #include "nnue/network.h" @@ -45,7 +46,7 @@ namespace Stockfish { namespace NN = Eval::NNUE; constexpr auto StartFEN = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"; -constexpr int MaxHashMB = Is64Bit ? 
33554432 : 2048; +constexpr int MaxHashMB = is_64bit() ? 33554432 : 2048; Engine::Engine(std::optional path) : binaryDirectory(path ? CommandLine::get_binary_directory(*path) : ""), diff --git a/src/memory.h b/src/memory.h index 3155a5aab12..f10bd6e56d9 100644 --- a/src/memory.h +++ b/src/memory.h @@ -27,7 +27,7 @@ #include #include -#include "types.h" +#include "common.h" namespace Stockfish { diff --git a/src/misc.cpp b/src/misc.cpp index 664ab4b89ff..39136606d94 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -16,8 +16,6 @@ along with this program. If not, see . */ -#include "misc.h" - #include #include #include @@ -31,6 +29,8 @@ #include #include +#include "common.h" +#include "misc.h" #include "types.h" namespace Stockfish { @@ -159,119 +159,119 @@ std::string engine_info(bool to_uci) { // Returns a string trying to describe the compiler we use std::string compiler_info() { + std::stringstream ss; -#define make_version_string(major, minor, patch) \ - stringify(major) "." stringify(minor) "." stringify(patch) - - // Predefined macros hell: - // - // __GNUC__ Compiler is GCC, Clang or ICX - // __clang__ Compiler is Clang or ICX - // __INTEL_LLVM_COMPILER Compiler is ICX - // _MSC_VER Compiler is MSVC - // _WIN32 Building on Windows (any) - // _WIN64 Building on Windows 64 bit - - std::string compiler = "\nCompiled by : "; + ss << "Compiler : "; #if defined(__INTEL_LLVM_COMPILER) - compiler += "ICX "; - compiler += stringify(__INTEL_LLVM_COMPILER); + ss << "ICX "; + ss << stringify(__VERSION); #elif defined(__clang__) - compiler += "clang++ "; - compiler += make_version_string(__clang_major__, __clang_minor__, __clang_patchlevel__); -#elif _MSC_VER - compiler += "MSVC "; - compiler += "(version "; - compiler += stringify(_MSC_FULL_VER) "." stringify(_MSC_BUILD); - compiler += ")"; + ss << "Clang "; + ss << __clang_major__ << '.' << __clang_minor__ << '.' 
<< __clang_patchlevel__; #elif defined(__e2k__) && defined(__LCC__) - #define dot_ver2(n) \ - compiler += char('.'); \ - compiler += char('0' + (n) / 10); \ - compiler += char('0' + (n) % 10); - - compiler += "MCST LCC "; - compiler += "(version "; - compiler += std::to_string(__LCC__ / 100); - dot_ver2(__LCC__ % 100) dot_ver2(__LCC_MINOR__) compiler += ")"; -#elif __GNUC__ - compiler += "g++ (GNUC) "; - compiler += make_version_string(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + ss << "MCST LCC "; + ss << (__LCC__ / 100) << '.' << std::setfill('0') << std::setw(2) << (__LCC__ % 100) << '.' + << __LCC_MINOR__; + ss.clear(); +#elif defined(__GNUC__) + ss << "g++ "; + ss << __GNUC__ << '.' << __GNUC_MINOR__ << '.' << __GNUC_PATCHLEVEL__; +#elif defined(_MSC_VER) + ss << "MSVC "; + ss << _MSC_FULL_VER << '.' << _MSC_BUILD; #else - compiler += "Unknown compiler "; - compiler += "(unknown version)"; + ss << "Unknown compiler"; #endif #if defined(__APPLE__) - compiler += " on Apple"; + ss << " on Apple"; #elif defined(__CYGWIN__) - compiler += " on Cygwin"; -#elif defined(__MINGW64__) - compiler += " on MinGW64"; + ss << " on Cygwin"; #elif defined(__MINGW32__) - compiler += " on MinGW32"; -#elif defined(__ANDROID__) - compiler += " on Android"; + ss << " on MinGW"; #elif defined(__linux__) - compiler += " on Linux"; + ss << " on Linux"; #elif defined(_WIN64) - compiler += " on Microsoft Windows 64-bit"; + ss << " on Microsoft Windows 64-bit"; #elif defined(_WIN32) - compiler += " on Microsoft Windows 32-bit"; + ss << " on Microsoft Windows 32-bit"; +#else + ss << " on unknown system"; +#endif + + ss << "\n"; + ss << "Build type : "; + +#ifdef NDEBUG + ss << "Release"; #else - compiler += " on unknown system"; + ss << "Debug"; #endif - compiler += "\nCompilation architecture : "; -#if defined(ARCH) - compiler += stringify(ARCH); + ss << "\n"; + ss << "Build profile : "; + +#ifdef ARCH + ss << stringify(ARCH); #else - compiler += "(undefined architecture)"; + ss << 
"unknown"; #endif - compiler += "\nCompilation settings : "; - compiler += (Is64Bit ? "64bit" : "32bit"); -#if defined(USE_VNNI) - compiler += " VNNI"; + ss << "\n"; + ss << "Compile options : "; + ss << (is_64bit() ? "64bit" : "32bit"); + +// x86/AMD64 family +#ifdef __AVX512F__ + ss << " AVX-512 (F"; + #ifdef __AVX512BW__ + ss << ",BW"; + #endif + #ifdef __AVX512VL__ + ss << ",VL"; + #endif + #ifdef __AVX512VNNI__ + ss << ",VNNI"; + #endif + ss << ")"; #endif -#if defined(USE_AVX512) - compiler += " AVX512"; +#ifdef __BMI2__ + ss << " BMI2"; #endif - compiler += (HasPext ? " BMI2" : ""); -#if defined(USE_AVX2) - compiler += " AVX2"; +#ifdef __AVX2__ + ss << " AVX2"; #endif -#if defined(USE_SSE41) - compiler += " SSE41"; +#ifdef __BMI__ + ss << " BMI"; #endif -#if defined(USE_SSSE3) - compiler += " SSSE3"; +#ifdef __AVX__ + ss << " AVX"; #endif -#if defined(USE_SSE2) - compiler += " SSE2"; +#ifdef __POPCNT__ + ss << " POPCNT"; #endif - compiler += (HasPopCnt ? " POPCNT" : ""); -#if defined(USE_NEON_DOTPROD) - compiler += " NEON_DOTPROD"; -#elif defined(USE_NEON) - compiler += " NEON"; +#ifdef __SSE4_1__ + ss << " SSE4.1"; #endif - -#if !defined(NDEBUG) - compiler += " DEBUG"; +#ifdef __SSSE3__ + ss << " SSSE3"; +#endif +#ifdef __SSE2__ + ss << " SSE2"; #endif - compiler += "\nCompiler __VERSION__ macro : "; -#ifdef __VERSION__ - compiler += __VERSION__; -#else - compiler += "(undefined macro)"; +// ARM/AArch64 family +#ifdef __ARM_FEATURE_DOTPROD + ss << " DotProd"; +#endif +#ifdef __ARM_NEON + ss << " Neon"; #endif - compiler += "\n"; + ss << "\n"; - return compiler; + return ss.str(); } @@ -407,24 +407,6 @@ void sync_cout_end() { std::cout << IO_UNLOCK; } // Trampoline helper to avoid moving Logger to misc.h void start_logger(const std::string& fname) { Logger::start(fname); } - -#ifdef NO_PREFETCH - -void prefetch(const void*) {} - -#else - -void prefetch(const void* addr) { - - #if defined(_MSC_VER) - _mm_prefetch((char const*) addr, _MM_HINT_T0); - #else - 
__builtin_prefetch(addr); - #endif -} - -#endif - #ifdef _WIN32 #include #define GETCWD _getcwd diff --git a/src/misc.h b/src/misc.h index ce49a1f6553..34d8c443018 100644 --- a/src/misc.h +++ b/src/misc.h @@ -38,11 +38,6 @@ namespace Stockfish { std::string engine_info(bool to_uci = false); std::string compiler_info(); -// Preloads the given address in L1/L2 cache. This is a non-blocking -// function that doesn't stall the CPU waiting for data to be loaded from memory, -// which can be quite slow. -void prefetch(const void* addr); - void start_logger(const std::string& fname); size_t str_to_size_t(const std::string& s); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 59a6149f0c4..c5cf5a40cc7 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -16,7 +16,17 @@ along with this program. If not, see . */ -// Definition of layer AffineTransform of NNUE evaluation function +// affine_transform.h contains the definition of AffineTransform layer. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// AffineTransform::propagate +// AffineTransform::get_weight_index +// +// Following class(es) must be defined in the architecture-specific files: +// +// AffineTransformSparseInput #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED @@ -24,105 +34,10 @@ #include #include -#include "../nnue_common.h" -#include "simd.h" - -/* - This file contains the definition for a fully connected layer (aka affine transform). - - - expected use-case is for when PaddedInputDimensions == 32 and InputDimensions <= 32. 
- - that's why AVX512 is hard to implement - - expected use-case is small layers - - inputs are processed in chunks of 4, weights are respectively transposed - - accumulation happens directly to int32s -*/ +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD) - #define ENABLE_SEQ_OPT -#endif - -// Fallback implementation for older/other architectures. -// Requires the input to be padded to at least 16 values. -#ifndef ENABLE_SEQ_OPT - -template -static void affine_transform_non_ssse3(std::int32_t* output, - const std::int8_t* weights, - const std::int32_t* biases, - const std::uint8_t* input) { - #if defined(USE_SSE2) || defined(USE_NEON) - #if defined(USE_SSE2) - // At least a multiple of 16, with SSE2. - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; - const __m128i Zeros = _mm_setzero_si128(); - const auto inputVector = reinterpret_cast(input); - - #elif defined(USE_NEON) - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 16) / 16; - const auto inputVector = reinterpret_cast(input); - #endif - - for (IndexType i = 0; i < OutputDimensions; ++i) - { - const IndexType offset = i * PaddedInputDimensions; - - #if defined(USE_SSE2) - __m128i sumLo = _mm_cvtsi32_si128(biases[i]); - __m128i sumHi = Zeros; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - __m128i row_j = _mm_load_si128(&row[j]); - __m128i input_j = _mm_load_si128(&inputVector[j]); - __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); - __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); - __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); - __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); - __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); - __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); - sumLo = _mm_add_epi32(sumLo, 
productLo); - sumHi = _mm_add_epi32(sumHi, productHi); - } - __m128i sum = _mm_add_epi32(sumLo, sumHi); - __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sumHigh_64); - __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_second_32); - output[i] = _mm_cvtsi128_si32(sum); - - #elif defined(USE_NEON) - - int32x4_t sum = {biases[i]}; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) - { - int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); - product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); - sum = vpadalq_s16(sum, product); - } - output[i] = sum[0] + sum[1] + sum[2] + sum[3]; - - #endif - } - #else - std::memcpy(output, biases, sizeof(std::int32_t) * OutputDimensions); - - // Traverse weights in transpose order to take advantage of input sparsity - for (IndexType i = 0; i < InputDimensions; ++i) - if (input[i]) - { - const std::int8_t* w = &weights[i]; - const int in = input[i]; - for (IndexType j = 0; j < OutputDimensions; ++j) - output[j] += w[j * PaddedInputDimensions] * in; - } - #endif -} - -#endif // !ENABLE_SEQ_OPT - template class AffineTransform { public: @@ -133,11 +48,12 @@ class AffineTransform { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = OutDims; + static_assert(InputDimensions > 0 && OutputDimensions > 0); static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); + ceil_to_multiple(InputDimensions, DimensionPadding); static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, MaxSimdWidth); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -150,19 +66,6 @@ class AffineTransform { return hashValue; } - static constexpr IndexType 
get_weight_index_scrambled(IndexType i) { - return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 - + i / PaddedInputDimensions * 4 + i % 4; - } - - static constexpr IndexType get_weight_index(IndexType i) { -#ifdef ENABLE_SEQ_OPT - return get_weight_index_scrambled(i); -#else - return i; -#endif - } - // Read network parameters bool read_parameters(std::istream& stream) { read_little_endian(stream, biases, OutputDimensions); @@ -181,126 +84,51 @@ class AffineTransform { return !stream.fail(); } + // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#ifdef ENABLE_SEQ_OPT - - if constexpr (OutputDimensions > 1) - { - #if defined(USE_AVX512) - using vec_t = __m512i; - #define vec_set_32 _mm512_set1_epi32 - #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 - #elif defined(USE_AVX2) - using vec_t = __m256i; - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_set_32 _mm_set1_epi32 - #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #elif defined(USE_NEON_DOTPROD) - using vec_t = int32x4_t; - #define vec_set_32 vdupq_n_s32 - #define vec_add_dpbusd_32(acc, a, b) \ - Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ - vreinterpretq_s8_s32(b)) - #endif - - static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); - - static_assert(OutputDimensions % OutputSimdWidth == 0); - - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; - constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; - - const auto input32 = reinterpret_cast(input); - const vec_t* biasvec = reinterpret_cast(biases); - vec_t acc[NumRegs]; - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasvec[k]; - - for (IndexType i = 0; i < NumChunks; ++i) - { - const vec_t in0 = vec_set_32(input32[i]); - const auto col0 = - reinterpret_cast(&weights[i * OutputDimensions * 
4]); - - for (IndexType k = 0; k < NumRegs; ++k) - vec_add_dpbusd_32(acc[k], in0, col0[k]); - } - - vec_t* outptr = reinterpret_cast(output); - for (IndexType k = 0; k < NumRegs; ++k) - outptr[k] = acc[k]; - - #undef vec_set_32 - #undef vec_add_dpbusd_32 - } - else if constexpr (OutputDimensions == 1) - { - // We cannot use AVX512 for the last layer because there are only 32 inputs - // and the buffer is not padded to 64 elements. - #if defined(USE_AVX2) - using vec_t = __m256i; - #define vec_setzero() _mm256_setzero_si256() - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #define vec_hadd Simd::m256_hadd - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_setzero() _mm_setzero_si128() - #define vec_set_32 _mm_set1_epi32 - #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #define vec_hadd Simd::m128_hadd - #elif defined(USE_NEON_DOTPROD) - using vec_t = int32x4_t; - #define vec_setzero() vdupq_n_s32(0) - #define vec_set_32 vdupq_n_s32 - #define vec_add_dpbusd_32(acc, a, b) \ - Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \ - vreinterpretq_s8_s32(b)) - #define vec_hadd Simd::neon_m128_hadd - #endif - - const auto inputVector = reinterpret_cast(input); - - static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType); - - static_assert(PaddedInputDimensions % InputSimdWidth == 0); - - constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; - vec_t sum0 = vec_setzero(); - const auto row0 = reinterpret_cast(&weights[0]); - - for (int j = 0; j < int(NumChunks); ++j) - { - const vec_t in = inputVector[j]; - vec_add_dpbusd_32(sum0, in, row0[j]); - } - output[0] = vec_hadd(sum0, biases[0]); - - #undef vec_setzero - #undef vec_set_32 - #undef vec_add_dpbusd_32 - #undef vec_hadd - } -#else - // Use old implementation for the other architectures. 
- affine_transform_non_ssse3( - output, weights, biases, input); -#endif - } + void propagate(const InputType* input, OutputType* output) const; - private: + protected: using BiasType = OutputType; using WeightType = std::int8_t; + static constexpr IndexType get_weight_index(IndexType i); + alignas(CacheLineSize) BiasType biases[OutputDimensions]; alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; }; } // namespace Stockfish::Eval::NNUE::Layers -#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED +// This macro is used to inherit types and constexpr values from +// AffineTransform class in case implementation details define specialized +// AffineTransformSparseInput class. +#define __DEFINE_BASE_PROPERTIES \ + using Base = AffineTransform; \ + using Base::biases, Base::weights; \ +\ + public: \ + using typename Base::InputType, typename Base::OutputType, typename Base::OutputBuffer; \ + using typename Base::BiasType, typename Base::WeightType; \ + using Base::InputDimensions, Base::OutputDimensions, Base::PaddedInputDimensions, \ + Base::PaddedOutputDimensions; + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/affine_transform.h" + #include "arch/i386/nnue/layers/affine_transform_sparse_input.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/affine_transform.h" + #include "arch/arm/nnue/layers/affine_transform_sparse_input.h" + +#else + + #include "arch/generic/nnue/layers/affine_transform.h" + +#endif + +#undef __DEFINE_BASE_PROPERTIES + +#endif // NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h deleted file mode 100644 index 0ac557abac2..00000000000 --- a/src/nnue/layers/affine_transform_sparse_input.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2024 The Stockfish developers 
(see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . -*/ - -// Definition of layer AffineTransformSparseInput of NNUE evaluation function - -#ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED -#define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED - -#include -#include -#include -#include - -#include "../../bitboard.h" -#include "../nnue_common.h" -#include "affine_transform.h" -#include "simd.h" - -/* - This file contains the definition for a fully connected layer (aka affine transform) with block sparse input. 
-*/ - -namespace Stockfish::Eval::NNUE::Layers { - -#if (USE_SSSE3 | (USE_NEON >= 8)) -alignas(CacheLineSize) static inline const - std::array, 256> lookup_indices = []() { - std::array, 256> v{}; - for (unsigned i = 0; i < 256; ++i) - { - std::uint64_t j = i, k = 0; - while (j) - v[i][k++] = pop_lsb(j); - } - return v; - }(); - -// Find indices of nonzero numbers in an int32_t array -template -void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) { - #if defined(USE_SSSE3) - #if defined(USE_AVX512) - using vec_t = __m512i; - #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) - #elif defined(USE_AVX2) - using vec_t = __m256i; - #if defined(USE_VNNI) && !defined(USE_AVXVNNI) - #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256()) - #else - #define vec_nnz(a) \ - _mm256_movemask_ps( \ - _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))) - #endif - #elif defined(USE_SSSE3) - using vec_t = __m128i; - #define vec_nnz(a) \ - _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128()))) - #endif - using vec128_t = __m128i; - #define vec128_zero _mm_setzero_si128() - #define vec128_set_16(a) _mm_set1_epi16(a) - #define vec128_load(a) _mm_load_si128(a) - #define vec128_storeu(a, b) _mm_storeu_si128(a, b) - #define vec128_add(a, b) _mm_add_epi16(a, b) - #elif defined(USE_NEON) - using vec_t = uint32x4_t; - static const std::uint32_t Mask[4] = {1, 2, 4, 8}; - #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask))) - using vec128_t = uint16x8_t; - #define vec128_zero vdupq_n_u16(0) - #define vec128_set_16(a) vdupq_n_u16(a) - #define vec128_load(a) vld1q_u16(reinterpret_cast(a)) - #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast(a), b) - #define vec128_add(a, b) vaddq_u16(a, b) - #endif - constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t); - // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we 
process in chunks of max(InputSimdWidth, 8) - constexpr IndexType ChunkSize = std::max(InputSimdWidth, 8); - constexpr IndexType NumChunks = InputDimensions / ChunkSize; - constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth; - constexpr IndexType OutputsPerChunk = ChunkSize / 8; - - const auto inputVector = reinterpret_cast(input); - IndexType count = 0; - vec128_t base = vec128_zero; - const vec128_t increment = vec128_set_16(8); - for (IndexType i = 0; i < NumChunks; ++i) - { - // bitmask of nonzero values in this chunk - unsigned nnz = 0; - for (IndexType j = 0; j < InputsPerChunk; ++j) - { - const vec_t inputChunk = inputVector[i * InputsPerChunk + j]; - nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth); - } - for (IndexType j = 0; j < OutputsPerChunk; ++j) - { - const auto lookup = (nnz >> (j * 8)) & 0xFF; - const auto offsets = - vec128_load(reinterpret_cast(&lookup_indices[lookup])); - vec128_storeu(reinterpret_cast(out + count), vec128_add(base, offsets)); - count += popcount(lookup); - base = vec128_add(base, increment); - } - } - count_out = count; -} - #undef vec_nnz - #undef vec128_zero - #undef vec128_set_16 - #undef vec128_load - #undef vec128_storeu - #undef vec128_add -#endif - -// Sparse input implementation -template -class AffineTransformSparseInput { - public: - // Input/output type - using InputType = std::uint8_t; - using OutputType = std::int32_t; - - // Number of input/output dimensions - static constexpr IndexType InputDimensions = InDims; - static constexpr IndexType OutputDimensions = OutDims; - - static_assert(OutputDimensions % 16 == 0, - "Only implemented for OutputDimensions divisible by 16."); - - static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); - static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, MaxSimdWidth); - -#if (USE_SSSE3 | (USE_NEON >= 8)) - static constexpr IndexType ChunkSize = 4; -#else - static constexpr 
IndexType ChunkSize = 1; -#endif - - using OutputBuffer = OutputType[PaddedOutputDimensions]; - - // Hash value embedded in the evaluation file - static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) { - std::uint32_t hashValue = 0xCC03DAE4u; - hashValue += OutputDimensions; - hashValue ^= prevHash >> 1; - hashValue ^= prevHash << 31; - return hashValue; - } - - static constexpr IndexType get_weight_index_scrambled(IndexType i) { - return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize - + i / PaddedInputDimensions * ChunkSize + i % ChunkSize; - } - - static constexpr IndexType get_weight_index(IndexType i) { -#if (USE_SSSE3 | (USE_NEON >= 8)) - return get_weight_index_scrambled(i); -#else - return i; -#endif - } - - // Read network parameters - bool read_parameters(std::istream& stream) { - read_little_endian(stream, biases, OutputDimensions); - for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - weights[get_weight_index(i)] = read_little_endian(stream); - - return !stream.fail(); - } - - // Write network parameters - bool write_parameters(std::ostream& stream) const { - write_little_endian(stream, biases, OutputDimensions); - - for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - write_little_endian(stream, weights[get_weight_index(i)]); - - return !stream.fail(); - } - // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if (USE_SSSE3 | (USE_NEON >= 8)) - #if defined(USE_AVX512) - using invec_t = __m512i; - using outvec_t = __m512i; - #define vec_set_32 _mm512_set1_epi32 - #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 - #elif defined(USE_AVX2) - using invec_t = __m256i; - using outvec_t = __m256i; - #define vec_set_32 _mm256_set1_epi32 - #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 - #elif defined(USE_SSSE3) - using invec_t = __m128i; - using outvec_t = __m128i; - #define vec_set_32 _mm_set1_epi32 - #define 
vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 - #elif defined(USE_NEON_DOTPROD) - using invec_t = int8x16_t; - using outvec_t = int32x4_t; - #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) - #define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32 - #elif defined(USE_NEON) - using invec_t = int8x16_t; - using outvec_t = int32x4_t; - #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a)) - #define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32 - #endif - static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType); - - constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / ChunkSize; - constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; - std::uint16_t nnz[NumChunks]; - IndexType count; - - const auto input32 = reinterpret_cast(input); - - // Find indices of nonzero 32-bit blocks - find_nnz(input32, nnz, count); - - const outvec_t* biasvec = reinterpret_cast(biases); - outvec_t acc[NumRegs]; - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasvec[k]; - - for (IndexType j = 0; j < count; ++j) - { - const auto i = nnz[j]; - const invec_t in = vec_set_32(input32[i]); - const auto col = - reinterpret_cast(&weights[i * OutputDimensions * ChunkSize]); - for (IndexType k = 0; k < NumRegs; ++k) - vec_add_dpbusd_32(acc[k], in, col[k]); - } - - outvec_t* outptr = reinterpret_cast(output); - for (IndexType k = 0; k < NumRegs; ++k) - outptr[k] = acc[k]; - #undef vec_set_32 - #undef vec_add_dpbusd_32 -#else - // Use dense implementation for the other architectures. 
- affine_transform_non_ssse3( - output, weights, biases, input); -#endif - } - - private: - using BiasType = OutputType; - using WeightType = std::int8_t; - - alignas(CacheLineSize) BiasType biases[OutputDimensions]; - alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; -}; - -} // namespace Stockfish::Eval::NNUE::Layers - -#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 2ee378ad881..daf3b5c3909 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -16,20 +16,23 @@ along with this program. If not, see . */ -// Definition of layer ClippedReLU of NNUE evaluation function +// clipped_relu.h contains the definition of ClippedReLU layer. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// ClippedReLU::propagate #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED -#include #include #include -#include "../nnue_common.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -// Clipped ReLU template class ClippedReLU { public: @@ -40,8 +43,10 @@ class ClippedReLU { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = InputDimensions; + static_assert(InputDimensions > 0); + static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, 32); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -59,106 +64,23 @@ class ClippedReLU { bool write_parameters(std::ostream&) const { return true; } // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if defined(USE_AVX2) - if constexpr (InputDimensions % SimdWidth == 0) - { - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const __m256i 
Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m256i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - const __m256i words0 = - _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]), - _mm256_load_si256(&in[i * 4 + 1])), - WeightScaleBits); - const __m256i words1 = - _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]), - _mm256_load_si256(&in[i * 4 + 3])), - WeightScaleBits); - _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32( - _mm256_packs_epi16(words0, words1), Offsets)); - } - } - else - { - constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - const __m128i words0 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - } - } - constexpr IndexType Start = InputDimensions % SimdWidth == 0 - ? 
InputDimensions / SimdWidth * SimdWidth - : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2); - -#elif defined(USE_SSE2) - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - - #ifndef USE_SSE41 - const __m128i k0x80s = _mm_set1_epi8(-128); - #endif - - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - #if defined(USE_SSE41) - const __m128i words0 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srli_epi16( - _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - #else - const __m128i words0 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])), - WeightScaleBits); - const __m128i words1 = _mm_srai_epi16( - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])), - WeightScaleBits); - const __m128i packedbytes = _mm_packs_epi16(words0, words1); - _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); - #endif - } - constexpr IndexType Start = NumChunks * SimdWidth; - -#elif defined(USE_NEON) - constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2); - const int8x8_t Zero = {0}; - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - int16x8_t shifted; - const auto pack = reinterpret_cast(&shifted); - pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits); - pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits); - out[i] = vmax_s8(vqmovn_s16(shifted), Zero); - } - constexpr IndexType Start = NumChunks * (SimdWidth / 2); -#else - constexpr IndexType Start = 0; -#endif - - for (IndexType i = Start; i < InputDimensions; ++i) - { - output[i] = 
static_cast(std::clamp(input[i] >> WeightScaleBits, 0, 127)); - } - } + void propagate(const InputType* input, OutputType* output) const; }; } // namespace Stockfish::Eval::NNUE::Layers +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/clipped_relu.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/clipped_relu.h" + +#else + + #include "arch/generic/nnue/layers/clipped_relu.h" + +#endif + #endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/layers/simd.h b/src/nnue/layers/simd.h deleted file mode 100644 index 55cb7df1421..00000000000 --- a/src/nnue/layers/simd.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) - - Stockfish is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Stockfish is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
-*/ - -#ifndef STOCKFISH_SIMD_H_INCLUDED -#define STOCKFISH_SIMD_H_INCLUDED - -#if defined(USE_AVX2) - #include - -#elif defined(USE_SSE41) - #include - -#elif defined(USE_SSSE3) - #include - -#elif defined(USE_SSE2) - #include - -#elif defined(USE_NEON) - #include -#endif - -namespace Stockfish::Simd { - -#if defined(USE_AVX512) - -[[maybe_unused]] static int m512_hadd(__m512i sum, int bias) { - return _mm512_reduce_add_epi32(sum) + bias; -} - -[[maybe_unused]] static void m512_add_dpbusd_epi32(__m512i& acc, __m512i a, __m512i b) { - - #if defined(USE_VNNI) - acc = _mm512_dpbusd_epi32(acc, a, b); - #else - __m512i product0 = _mm512_maddubs_epi16(a, b); - product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); - acc = _mm512_add_epi32(acc, product0); - #endif -} - -#endif - -#if defined(USE_AVX2) - -[[maybe_unused]] static int m256_hadd(__m256i sum, int bias) { - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - return _mm_cvtsi128_si32(sum128) + bias; -} - -[[maybe_unused]] static void m256_add_dpbusd_epi32(__m256i& acc, __m256i a, __m256i b) { - - #if defined(USE_VNNI) - acc = _mm256_dpbusd_epi32(acc, a, b); - #else - __m256i product0 = _mm256_maddubs_epi16(a, b); - product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); - acc = _mm256_add_epi32(acc, product0); - #endif -} - -#endif - -#if defined(USE_SSSE3) - -[[maybe_unused]] static int m128_hadd(__m128i sum, int bias) { - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - return _mm_cvtsi128_si32(sum) + bias; -} - -[[maybe_unused]] static void m128_add_dpbusd_epi32(__m128i& acc, __m128i a, __m128i b) { - - __m128i product0 = _mm_maddubs_epi16(a, b); - product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); - 
acc = _mm_add_epi32(acc, product0); -} - -#endif - -#if defined(USE_NEON_DOTPROD) - -[[maybe_unused]] static void -dotprod_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { - - acc = vdotq_s32(acc, a, b); -} -#endif - -#if defined(USE_NEON) - -[[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) { - #if USE_NEON >= 8 - return vaddvq_s32(s); - #else - return s[0] + s[1] + s[2] + s[3]; - #endif -} - -[[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) { - return neon_m128_reduce_add_epi32(sum) + bias; -} - -#endif - -#if USE_NEON >= 8 -[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(int32x4_t& acc, int8x16_t a, int8x16_t b) { - - int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b)); - int16x8_t product1 = vmull_high_s8(a, b); - int16x8_t sum = vpaddq_s16(product0, product1); - acc = vpadalq_s16(acc, sum); -} -#endif -} - -#endif // STOCKFISH_SIMD_H_INCLUDED diff --git a/src/nnue/layers/sqr_clipped_relu.h b/src/nnue/layers/sqr_clipped_relu.h index 9c20df9d6f5..e755be85377 100644 --- a/src/nnue/layers/sqr_clipped_relu.h +++ b/src/nnue/layers/sqr_clipped_relu.h @@ -16,20 +16,23 @@ along with this program. If not, see . */ -// Definition of layer ClippedReLU of NNUE evaluation function +// sqr_clipped_relu.h contains the definition of SqrClippedReLU layer. 
+// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// SqrClippedReLU::propagate #ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED #define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED -#include #include #include -#include "../nnue_common.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE::Layers { -// Clipped ReLU template class SqrClippedReLU { public: @@ -40,8 +43,10 @@ class SqrClippedReLU { // Number of input/output dimensions static constexpr IndexType InputDimensions = InDims; static constexpr IndexType OutputDimensions = InputDimensions; + static_assert(InputDimensions > 0); + static constexpr IndexType PaddedOutputDimensions = - ceil_to_multiple(OutputDimensions, 32); + ceil_to_multiple(OutputDimensions, DimensionPadding); using OutputBuffer = OutputType[PaddedOutputDimensions]; @@ -59,45 +64,23 @@ class SqrClippedReLU { bool write_parameters(std::ostream&) const { return true; } // Forward propagation - void propagate(const InputType* input, OutputType* output) const { - -#if defined(USE_SSE2) - constexpr IndexType NumChunks = InputDimensions / 16; - - static_assert(WeightScaleBits == 6); - const auto in = reinterpret_cast(input); - const auto out = reinterpret_cast<__m128i*>(output); - for (IndexType i = 0; i < NumChunks; ++i) - { - __m128i words0 = - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])); - __m128i words1 = - _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])); - - // We shift by WeightScaleBits * 2 = 12 and divide by 128 - // which is an additional shift-right of 7, meaning 19 in total. - // MulHi strips the lower 16 bits so we need to shift out 3 more to match. 
- words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3); - words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3); - - _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1)); - } - constexpr IndexType Start = NumChunks * 16; + void propagate(const InputType* input, OutputType* output) const; +}; + +} // namespace Stockfish::Eval::NNUE::Layers + +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/layers/sqr_clipped_relu.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/layers/sqr_clipped_relu.h" #else - constexpr IndexType Start = 0; -#endif - for (IndexType i = Start; i < InputDimensions; ++i) - { - output[i] = static_cast( - // Really should be /127 but we need to make it fast so we right-shift - // by an extra 7 bits instead. Needs to be accounted for in the trainer. - std::min(127ll, ((long long) (input[i]) * input[i]) >> (2 * WeightScaleBits + 7))); - } - } -}; + #include "arch/generic/nnue/layers/sqr_clipped_relu.h" -} // namespace Stockfish::Eval::NNUE::Layers +#endif #endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index f7d2cc6ada0..6f863cc4f84 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -98,7 +98,7 @@ bool read_parameters(std::istream& stream, T& reference) { // Write evaluation function parameters template -bool write_parameters(std::ostream& stream, const T& reference) { +bool write_parameters(std::ostream& stream, T& reference) { write_little_endian(stream, T::get_hash_value()); return reference.write_parameters(stream); @@ -174,7 +174,7 @@ void Network::load(const std::string& rootDirectory, std::str template -bool Network::save(const std::optional& filename) const { +bool Network::save(const std::optional& filename) { std::string actualFilename; std::string msg; @@ -210,17 +210,14 @@ Network::evaluate(const Position& pos AccumulatorCaches::Cache* cache) const { // We manually align the arrays on the stack 
because with gcc < 9.3 // overaligning stack variables with alignas() doesn't work correctly. - - constexpr uint64_t alignment = CacheLineSize; - #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) TransformedFeatureType transformedFeaturesUnaligned[FeatureTransformer::BufferSize - + alignment / sizeof(TransformedFeatureType)]; + + CacheLineSize / sizeof(TransformedFeatureType)]; - auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); + auto* transformedFeatures = align_ptr_up(&transformedFeaturesUnaligned[0]); #else - alignas(alignment) TransformedFeatureType + alignas(CacheLineSize) TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize]; #endif @@ -360,7 +357,7 @@ void Network::initialize() { template bool Network::save(std::ostream& stream, const std::string& name, - const std::string& netDescription) const { + const std::string& netDescription) { if (name.empty() || name == "None") return false; @@ -410,7 +407,7 @@ bool Network::write_header(std::ostream& stream, template bool Network::read_parameters(std::istream& stream, - std::string& netDescription) const { + std::string& netDescription) { std::uint32_t hashValue; if (!read_header(stream, &hashValue, &netDescription)) return false; @@ -429,7 +426,7 @@ bool Network::read_parameters(std::istream& stream, template bool Network::write_parameters(std::ostream& stream, - const std::string& netDescription) const { + const std::string& netDescription) { if (!write_header(stream, Network::hash, netDescription)) return false; if (!Detail::write_parameters(stream, *featureTransformer)) @@ -444,12 +441,8 @@ bool Network::write_parameters(std::ostream& stream, // Explicit template instantiation -template class Network< - NetworkArchitecture, - FeatureTransformer>; +template class Network; -template class Network< - NetworkArchitecture, - FeatureTransformer>; +template class Network; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/network.h b/src/nnue/network.h index 
152082552c9..6008b11ddfe 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -59,7 +59,7 @@ class Network { Network& operator=(Network&& other) = default; void load(const std::string& rootDirectory, std::string evalfilePath); - bool save(const std::optional& filename) const; + bool save(const std::optional& filename); NetworkOutput evaluate(const Position& pos, AccumulatorCaches::Cache* cache) const; @@ -78,14 +78,14 @@ class Network { void initialize(); - bool save(std::ostream&, const std::string&, const std::string&) const; + bool save(std::ostream&, const std::string&, const std::string&); std::optional load(std::istream&); bool read_header(std::istream&, std::uint32_t*, std::string*) const; bool write_header(std::ostream&, std::uint32_t, const std::string&) const; - bool read_parameters(std::istream&, std::string&) const; - bool write_parameters(std::ostream&, const std::string&) const; + bool read_parameters(std::istream&, std::string&); + bool write_parameters(std::ostream&, const std::string&); // Input feature converter LargePagePtr featureTransformer; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index b8dcf1e480f..be25f3f7aa6 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -21,6 +21,7 @@ #ifndef NNUE_ACCUMULATOR_H_INCLUDED #define NNUE_ACCUMULATOR_H_INCLUDED +#include #include #include "nnue_architecture.h" @@ -28,10 +29,6 @@ namespace Stockfish::Eval::NNUE { -using BiasType = std::int16_t; -using PSQTWeightType = std::int32_t; -using IndexType = std::uint32_t; - // Class that holds the result of affine transformation of input features template struct alignas(CacheLineSize) Accumulator { @@ -56,6 +53,8 @@ struct AccumulatorCaches { template struct alignas(CacheLineSize) Cache { + using BiasType = FeatureTransformerBiasType; + using PSQTWeightType = FeatureTransformerPSQTWeightType; struct alignas(CacheLineSize) Entry { BiasType accumulation[Size]; diff --git a/src/nnue/nnue_architecture.h 
b/src/nnue/nnue_architecture.h index 7f73f87fd5e..ea443b38ed2 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -27,7 +27,6 @@ #include "features/half_ka_v2_hm.h" #include "layers/affine_transform.h" -#include "layers/affine_transform_sparse_input.h" #include "layers/clipped_relu.h" #include "layers/sqr_clipped_relu.h" #include "nnue_common.h" diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 4bc3408f18a..774be5093e2 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -16,7 +16,7 @@ along with this program. If not, see . */ -// Constants used in NNUE evaluation function +// Common constants and functions used in NNUE evaluation function #ifndef NNUE_COMMON_H_INCLUDED #define NNUE_COMMON_H_INCLUDED @@ -28,55 +28,32 @@ #include #include -#include "../misc.h" - -#if defined(USE_AVX2) - #include - -#elif defined(USE_SSE41) - #include - -#elif defined(USE_SSSE3) - #include - -#elif defined(USE_SSE2) - #include - -#elif defined(USE_NEON) - #include -#endif +#include "misc.h" namespace Stockfish::Eval::NNUE { // Version of the evaluation file constexpr std::uint32_t Version = 0x7AF32F20u; -// Constant used in evaluation value calculation -constexpr int OutputScale = 16; -constexpr int WeightScaleBits = 6; +using IndexType = std::uint32_t; +using TransformedFeatureType = std::uint8_t; + +// Types used in the feature transformer and accumulator cache entries +using FeatureTransformerBiasType = std::int16_t; +using FeatureTransformerPSQTWeightType = std::int32_t; // Size of cache line (in bytes) constexpr std::size_t CacheLineSize = 64; -constexpr const char Leb128MagicString[] = "COMPRESSED_LEB128"; -constexpr const std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1; - -// SIMD width (in bytes) -#if defined(USE_AVX2) -constexpr std::size_t SimdWidth = 32; +// Padding to dimensions of input/output buffers across layers +constexpr std::size_t DimensionPadding = 32; -#elif defined(USE_SSE2) 
-constexpr std::size_t SimdWidth = 16; - -#elif defined(USE_NEON) -constexpr std::size_t SimdWidth = 16; -#endif - -constexpr std::size_t MaxSimdWidth = 32; +// Constant used in evaluation value calculation +constexpr int OutputScale = 16; +constexpr int WeightScaleBits = 6; -// Type of input feature after conversion -using TransformedFeatureType = std::uint8_t; -using IndexType = std::uint32_t; +constexpr const char Leb128MagicString[] = "COMPRESSED_LEB128"; +constexpr std::size_t Leb128MagicStringSize = sizeof(Leb128MagicString) - 1; // Round n up to be a multiple of base template @@ -84,6 +61,26 @@ constexpr IntType ceil_to_multiple(IntType n, IntType base) { return (n + base - 1) / base * base; } +template +constexpr int optimal_register_count() { + static_assert(RegisterSize > 0 && LaneSize > 0 && NumLanes > 0); + static_assert(RegisterSize >= LaneSize && RegisterSize % LaneSize == 0); + static_assert((NumLanes * LaneSize) % RegisterSize == 0); + + // The exact number of registers that can fit in the whole input vectors. + constexpr int Ideal = (NumLanes * LaneSize) / RegisterSize; + + if constexpr (Ideal <= NumRegisters) + return Ideal; + + // Look for the largest divisor of the ideal register count that is + // smaller than NumRegisters. + for (int divisor = NumRegisters; divisor > 1; --divisor) + if (Ideal % divisor == 0) + return divisor; + + return 1; +} // Utility to read an integer (signed or unsigned, any size) // from a stream in little-endian order. We swap the byte order after the read if diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index fa180678d89..70953368cc3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -16,7 +16,16 @@ along with this program. If not, see . 
*/ -// A class that converts the input features of the NNUE evaluation function +// nnue_feature_transformer.h contains the definition of FeatureTransformer +// class, which converts a position into NNUE input features. +// +// Following function(s) must be implemented in the architecture-specific +// files: +// +// FeatureTransformer::permute_weights +// FeatureTransformer::apply_accumulator_updates_incremental +// FeatureTransformer::apply_accumulator_updates_refresh_cache +// FeatureTransformer::convert_accumulators #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED @@ -24,190 +33,23 @@ #include #include #include -#include #include #include -#include "../position.h" -#include "../types.h" -#include "nnue_accumulator.h" -#include "nnue_architecture.h" -#include "nnue_common.h" +#include "position.h" +#include "types.h" +#include "nnue/nnue_accumulator.h" +#include "nnue/nnue_architecture.h" +#include "nnue/nnue_common.h" namespace Stockfish::Eval::NNUE { -using BiasType = std::int16_t; -using WeightType = std::int16_t; -using PSQTWeightType = std::int32_t; - -// If vector instructions are enabled, we update and refresh the -// accumulator tile by tile such that each tile fits in the CPU's -// vector registers. 
-#define VECTOR - -static_assert(PSQTBuckets % 8 == 0, - "Per feature PSQT values cannot be processed at granularity lower than 8 at a time."); - -#ifdef USE_AVX512 -using vec_t = __m512i; -using psqt_vec_t = __m256i; - #define vec_load(a) _mm512_load_si512(a) - #define vec_store(a, b) _mm512_store_si512(a, b) - #define vec_add_16(a, b) _mm512_add_epi16(a, b) - #define vec_sub_16(a, b) _mm512_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm512_mulhi_epi16(a, b) - #define vec_zero() _mm512_setzero_epi32() - #define vec_set_16(a) _mm512_set1_epi16(a) - #define vec_max_16(a, b) _mm512_max_epi16(a, b) - #define vec_min_16(a, b) _mm512_min_epi16(a, b) - #define vec_slli_16(a, b) _mm512_slli_epi16(a, b) - // Inverse permuted at load time - #define vec_packus_16(a, b) _mm512_packus_epi16(a, b) - #define vec_load_psqt(a) _mm256_load_si256(a) - #define vec_store_psqt(a, b) _mm256_store_si256(a, b) - #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) - #define vec_zero_psqt() _mm256_setzero_si256() - #define NumRegistersSIMD 16 - #define MaxChunkSize 64 - -#elif USE_AVX2 -using vec_t = __m256i; -using psqt_vec_t = __m256i; - #define vec_load(a) _mm256_load_si256(a) - #define vec_store(a, b) _mm256_store_si256(a, b) - #define vec_add_16(a, b) _mm256_add_epi16(a, b) - #define vec_sub_16(a, b) _mm256_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm256_mulhi_epi16(a, b) - #define vec_zero() _mm256_setzero_si256() - #define vec_set_16(a) _mm256_set1_epi16(a) - #define vec_max_16(a, b) _mm256_max_epi16(a, b) - #define vec_min_16(a, b) _mm256_min_epi16(a, b) - #define vec_slli_16(a, b) _mm256_slli_epi16(a, b) - // Inverse permuted at load time - #define vec_packus_16(a, b) _mm256_packus_epi16(a, b) - #define vec_load_psqt(a) _mm256_load_si256(a) - #define vec_store_psqt(a, b) _mm256_store_si256(a, b) - #define vec_add_psqt_32(a, b) _mm256_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm256_sub_epi32(a, b) - #define 
vec_zero_psqt() _mm256_setzero_si256() - #define NumRegistersSIMD 16 - #define MaxChunkSize 32 - -#elif USE_SSE2 -using vec_t = __m128i; -using psqt_vec_t = __m128i; - #define vec_load(a) (*(a)) - #define vec_store(a, b) *(a) = (b) - #define vec_add_16(a, b) _mm_add_epi16(a, b) - #define vec_sub_16(a, b) _mm_sub_epi16(a, b) - #define vec_mulhi_16(a, b) _mm_mulhi_epi16(a, b) - #define vec_zero() _mm_setzero_si128() - #define vec_set_16(a) _mm_set1_epi16(a) - #define vec_max_16(a, b) _mm_max_epi16(a, b) - #define vec_min_16(a, b) _mm_min_epi16(a, b) - #define vec_slli_16(a, b) _mm_slli_epi16(a, b) - #define vec_packus_16(a, b) _mm_packus_epi16(a, b) - #define vec_load_psqt(a) (*(a)) - #define vec_store_psqt(a, b) *(a) = (b) - #define vec_add_psqt_32(a, b) _mm_add_epi32(a, b) - #define vec_sub_psqt_32(a, b) _mm_sub_epi32(a, b) - #define vec_zero_psqt() _mm_setzero_si128() - #define NumRegistersSIMD (Is64Bit ? 16 : 8) - #define MaxChunkSize 16 - -#elif USE_NEON -using vec_t = int16x8_t; -using psqt_vec_t = int32x4_t; - #define vec_load(a) (*(a)) - #define vec_store(a, b) *(a) = (b) - #define vec_add_16(a, b) vaddq_s16(a, b) - #define vec_sub_16(a, b) vsubq_s16(a, b) - #define vec_mulhi_16(a, b) vqdmulhq_s16(a, b) - #define vec_zero() \ - vec_t { 0 } - #define vec_set_16(a) vdupq_n_s16(a) - #define vec_max_16(a, b) vmaxq_s16(a, b) - #define vec_min_16(a, b) vminq_s16(a, b) - #define vec_slli_16(a, b) vshlq_s16(a, vec_set_16(b)) - #define vec_packus_16(a, b) reinterpret_cast(vcombine_u8(vqmovun_s16(a), vqmovun_s16(b))) - #define vec_load_psqt(a) (*(a)) - #define vec_store_psqt(a, b) *(a) = (b) - #define vec_add_psqt_32(a, b) vaddq_s32(a, b) - #define vec_sub_psqt_32(a, b) vsubq_s32(a, b) - #define vec_zero_psqt() \ - psqt_vec_t { 0 } - #define NumRegistersSIMD 16 - #define MaxChunkSize 16 - -#else - #undef VECTOR - -#endif - - -#ifdef VECTOR - - // Compute optimal SIMD register count for feature transformer accumulation. 
- - // We use __m* types as template arguments, which causes GCC to emit warnings - // about losing some attribute information. This is irrelevant to us as we - // only take their size, so the following pragma are harmless. - #if defined(__GNUC__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wignored-attributes" - #endif - -template -static constexpr int BestRegisterCount() { - #define RegisterSize sizeof(SIMDRegisterType) - #define LaneSize sizeof(LaneType) - - static_assert(RegisterSize >= LaneSize); - static_assert(MaxRegisters <= NumRegistersSIMD); - static_assert(MaxRegisters > 0); - static_assert(NumRegistersSIMD > 0); - static_assert(RegisterSize % LaneSize == 0); - static_assert((NumLanes * LaneSize) % RegisterSize == 0); - - const int ideal = (NumLanes * LaneSize) / RegisterSize; - if (ideal <= MaxRegisters) - return ideal; - - // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters - for (int divisor = MaxRegisters; divisor > 1; --divisor) - if (ideal % divisor == 0) - return divisor; - - return 1; -} - #if defined(__GNUC__) - #pragma GCC diagnostic pop - #endif -#endif - - -// Input feature converter template StateInfo::*accPtr> class FeatureTransformer { - // Number of output dimensions for one side static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; - private: -#ifdef VECTOR - static constexpr int NumRegs = - BestRegisterCount(); - static constexpr int NumPsqtRegs = - BestRegisterCount(); - - static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2; - static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4; - static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions"); - static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets"); -#endif - public: // Output type using OutputType = TransformedFeatureType; @@ -224,97 +66,28 @@ class FeatureTransformer { return 
FeatureSet::HashValue ^ (OutputDimensions * 2); } - static constexpr void order_packs([[maybe_unused]] uint64_t* v) { -#if defined(USE_AVX512) // _mm512_packs_epi16 ordering - uint64_t tmp0 = v[2], tmp1 = v[3]; - v[2] = v[8], v[3] = v[9]; - v[8] = v[4], v[9] = v[5]; - v[4] = tmp0, v[5] = tmp1; - tmp0 = v[6], tmp1 = v[7]; - v[6] = v[10], v[7] = v[11]; - v[10] = v[12], v[11] = v[13]; - v[12] = tmp0, v[13] = tmp1; -#elif defined(USE_AVX2) // _mm256_packs_epi16 ordering - std::swap(v[2], v[4]); - std::swap(v[3], v[5]); -#endif - } - - static constexpr void inverse_order_packs([[maybe_unused]] uint64_t* v) { -#if defined(USE_AVX512) // Inverse _mm512_packs_epi16 ordering - uint64_t tmp0 = v[2], tmp1 = v[3]; - v[2] = v[4], v[3] = v[5]; - v[4] = v[8], v[5] = v[9]; - v[8] = tmp0, v[9] = tmp1; - tmp0 = v[6], tmp1 = v[7]; - v[6] = v[12], v[7] = v[13]; - v[12] = v[10], v[13] = v[11]; - v[10] = tmp0, v[11] = tmp1; -#elif defined(USE_AVX2) // Inverse _mm256_packs_epi16 ordering - std::swap(v[2], v[4]); - std::swap(v[3], v[5]); -#endif - } - - void permute_weights([[maybe_unused]] void (*order_fn)(uint64_t*)) const { -#if defined(USE_AVX2) - #if defined(USE_AVX512) - constexpr IndexType di = 16; - #else - constexpr IndexType di = 8; - #endif - uint64_t* b = reinterpret_cast(const_cast(&biases[0])); - for (IndexType i = 0; i < HalfDimensions * sizeof(BiasType) / sizeof(uint64_t); i += di) - order_fn(&b[i]); - - for (IndexType j = 0; j < InputDimensions; ++j) - { - uint64_t* w = - reinterpret_cast(const_cast(&weights[j * HalfDimensions])); - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(uint64_t); - i += di) - order_fn(&w[i]); - } -#endif - } - - inline void scale_weights(bool read) const { - for (IndexType j = 0; j < InputDimensions; ++j) - { - WeightType* w = const_cast(&weights[j * HalfDimensions]); - for (IndexType i = 0; i < HalfDimensions; ++i) - w[i] = read ? 
w[i] * 2 : w[i] / 2; - } - - BiasType* b = const_cast(biases); - for (IndexType i = 0; i < HalfDimensions; ++i) - b[i] = read ? b[i] * 2 : b[i] / 2; - } - // Read network parameters bool read_parameters(std::istream& stream) { - read_leb_128(stream, biases, HalfDimensions); read_leb_128(stream, weights, HalfDimensions * InputDimensions); read_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); - permute_weights(inverse_order_packs); - scale_weights(true); + permute_weights(); + scale_weights(); return !stream.fail(); } // Write network parameters - bool write_parameters(std::ostream& stream) const { - - permute_weights(order_packs); - scale_weights(false); + bool write_parameters(std::ostream& stream) { + permute_weights(); + scale_weights(); write_leb_128(stream, biases, HalfDimensions); write_leb_128(stream, weights, HalfDimensions * InputDimensions); write_leb_128(stream, psqtWeights, PSQTBuckets * InputDimensions); - permute_weights(inverse_order_packs); - scale_weights(true); + permute_weights(); + scale_weights(); return !stream.fail(); } @@ -323,135 +96,51 @@ class FeatureTransformer { AccumulatorCaches::Cache* cache, OutputType* output, int bucket) const { + update_accumulator(pos, cache); update_accumulator(pos, cache); + convert_accumulators(pos, output); - const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation; - const auto psqt = - (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket]) - / 2; - - const auto& accumulation = (pos.state()->*accPtr).accumulation; - for (IndexType p = 0; p < 2; ++p) - { - const IndexType offset = (HalfDimensions / 2) * p; - -#if defined(VECTOR) - - constexpr IndexType OutputChunkSize = MaxChunkSize; - static_assert((HalfDimensions / 2) % OutputChunkSize == 0); - constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize; - - const vec_t Zero = vec_zero(); - const vec_t One = 
vec_set_16(127 * 2); - - const vec_t* in0 = reinterpret_cast(&(accumulation[perspectives[p]][0])); - const vec_t* in1 = - reinterpret_cast(&(accumulation[perspectives[p]][HalfDimensions / 2])); - vec_t* out = reinterpret_cast(output + offset); - - // Per the NNUE architecture, here we want to multiply pairs of - // clipped elements and divide the product by 128. To do this, - // we can naively perform min/max operation to clip each of the - // four int16 vectors, mullo pairs together, then pack them into - // one int8 vector. However, there exists a faster way. - - // The idea here is to use the implicit clipping from packus to - // save us two vec_max_16 instructions. This clipping works due - // to the fact that any int16 integer below zero will be zeroed - // on packus. - - // Consider the case where the second element is negative. - // If we do standard clipping, that element will be zero, which - // means our pairwise product is zero. If we perform packus and - // remove the lower-side clip for the second element, then our - // product before packus will be negative, and is zeroed on pack. - // The two operation produce equivalent results, but the second - // one (using packus) saves one max operation per pair. - - // But here we run into a problem: mullo does not preserve the - // sign of the multiplication. We can get around this by doing - // mulhi, which keeps the sign. But that requires an additional - // tweak. - - // mulhi cuts off the last 16 bits of the resulting product, - // which is the same as performing a rightward shift of 16 bits. - // We can use this to our advantage. Recall that we want to - // divide the final product by 128, which is equivalent to a - // 7-bit right shift. Intuitively, if we shift the clipped - // value left by 9, and perform mulhi, which shifts the product - // right by 16 bits, then we will net a right shift of 7 bits. - // However, this won't work as intended. 
Since we clip the - // values to have a maximum value of 127, shifting it by 9 bits - // might occupy the signed bit, resulting in some positive - // values being interpreted as negative after the shift. - - // There is a way, however, to get around this limitation. When - // loading the network, scale accumulator weights and biases by - // 2. To get the same pairwise multiplication result as before, - // we need to divide the product by 128 * 2 * 2 = 512, which - // amounts to a right shift of 9 bits. So now we only have to - // shift left by 7 bits, perform mulhi (shifts right by 16 bits) - // and net a 9 bit right shift. Since we scaled everything by - // two, the values are clipped at 127 * 2 = 254, which occupies - // 8 bits. Shifting it by 7 bits left will no longer occupy the - // signed bit, so we are safe. - - // Note that on NEON processors, we shift left by 6 instead - // because the instruction "vqdmulhq_s16" also doubles the - // return value after the multiplication, adding an extra shift - // to the left by 1, so we compensate by shifting less before - // the multiplication. 
- - constexpr int shift = - #if defined(USE_SSE2) - 7; - #else - 6; - #endif - - for (IndexType j = 0; j < NumOutputChunks; ++j) - { - const vec_t sum0a = - vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 0], One), Zero), shift); - const vec_t sum0b = - vec_slli_16(vec_max_16(vec_min_16(in0[j * 2 + 1], One), Zero), shift); - const vec_t sum1a = vec_min_16(in1[j * 2 + 0], One); - const vec_t sum1b = vec_min_16(in1[j * 2 + 1], One); + return (psqtAccumulation[pos.side_to_move()][bucket] + - psqtAccumulation[~pos.side_to_move()][bucket]) + / 2; + } - const vec_t pa = vec_mulhi_16(sum0a, sum1a); - const vec_t pb = vec_mulhi_16(sum0b, sum1b); + void hint_common_access(const Position& pos, + AccumulatorCaches::Cache* cache) const { + hint_common_access_for_perspective(pos, cache); + hint_common_access_for_perspective(pos, cache); + } - out[j] = vec_packus_16(pa, pb); - } + private: + using BiasType = FeatureTransformerBiasType; + using WeightType = std::int16_t; + using PSQTWeightType = FeatureTransformerPSQTWeightType; -#else + // Stores constants and types based on the target architecture. + struct Details; - for (IndexType j = 0; j < HalfDimensions / 2; ++j) - { - BiasType sum0 = accumulation[static_cast(perspectives[p])][j + 0]; - BiasType sum1 = - accumulation[static_cast(perspectives[p])][j + HalfDimensions / 2]; - sum0 = std::clamp(sum0, 0, 127 * 2); - sum1 = std::clamp(sum1, 0, 127 * 2); - output[offset + j] = static_cast(unsigned(sum0 * sum1) / 512); - } + template + inline void permute_weights(); -#endif + template + inline void scale_weights() { + for (IndexType j = 0; j < InputDimensions; ++j) + { + WeightType* w = &weights[j * HalfDimensions]; + for (IndexType i = 0; i < HalfDimensions; ++i) + w[i] = Write ? 
w[i] / 2 : w[i] * 2; } - return psqt; - } // end of function transform() - - void hint_common_access(const Position& pos, - AccumulatorCaches::Cache* cache) const { - hint_common_access_for_perspective(pos, cache); - hint_common_access_for_perspective(pos, cache); + for (IndexType i = 0; i < HalfDimensions; ++i) + biases[i] = Write ? biases[i] / 2 : biases[i] * 2; } - private: + // Look for an accumulator of an earlier position. It traverses the linked + // list of states starting from the current position and goes back until it + // finds a computed accumulator or a state that requires a full refresh. template StateInfo* try_find_computed_accumulator(const Position& pos) const { // Look for a usable accumulator of an earlier position. We keep track @@ -477,13 +166,6 @@ class FeatureTransformer { assert((computed->*accPtr).computed[Perspective]); assert(computed->next != nullptr); -#ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch. - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; -#endif - const Square ksq = pos.square(Perspective); // The size must be enough to contain the largest possible update. @@ -503,162 +185,8 @@ class FeatureTransformer { StateInfo* next = CurrentOnly ? 
pos.state() : computed->next; assert(!(next->*accPtr).computed[Perspective]); -#ifdef VECTOR - if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) - { - auto accIn = - reinterpret_cast(&(computed->*accPtr).accumulation[Perspective][0]); - auto accOut = reinterpret_cast(&(next->*accPtr).accumulation[Perspective][0]); - - const IndexType offsetR0 = HalfDimensions * removed[0]; - auto columnR0 = reinterpret_cast(&weights[offsetR0]); - const IndexType offsetA = HalfDimensions * added[0]; - auto columnA = reinterpret_cast(&weights[offsetA]); - - if (removed.size() == 1) - { - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_add_16(vec_sub_16(accIn[i], columnR0[i]), columnA[i]); - } - else - { - const IndexType offsetR1 = HalfDimensions * removed[1]; - auto columnR1 = reinterpret_cast(&weights[offsetR1]); - - for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) - accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]), - vec_add_16(columnR0[i], columnR1[i])); - } - - auto accPsqtIn = reinterpret_cast( - &(computed->*accPtr).psqtAccumulation[Perspective][0]); - auto accPsqtOut = - reinterpret_cast(&(next->*accPtr).psqtAccumulation[Perspective][0]); - - const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; - auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); - const IndexType offsetPsqtA = PSQTBuckets * added[0]; - auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); - - if (removed.size() == 1) - { - for (std::size_t i = 0; - i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) - accPsqtOut[i] = vec_add_psqt_32(vec_sub_psqt_32(accPsqtIn[i], columnPsqtR0[i]), - columnPsqtA[i]); - } - else - { - const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; - auto columnPsqtR1 = reinterpret_cast(&psqtWeights[offsetPsqtR1]); - - for (std::size_t i = 0; - i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) - accPsqtOut[i] 
= - vec_sub_psqt_32(vec_add_psqt_32(accPsqtIn[i], columnPsqtA[i]), - vec_add_psqt_32(columnPsqtR0[i], columnPsqtR1[i])); - } - } - else - { - for (IndexType i = 0; i < HalfDimensions / TileHeight; ++i) - { - // Load accumulator - auto accTileIn = reinterpret_cast( - &(computed->*accPtr).accumulation[Perspective][i * TileHeight]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_load(&accTileIn[j]); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index + i * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_sub_16(acc[j], column[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index + i * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumRegs; ++j) - acc[j] = vec_add_16(acc[j], column[j]); - } - - // Store accumulator - auto accTileOut = reinterpret_cast( - &(next->*accPtr).accumulation[Perspective][i * TileHeight]); - for (IndexType j = 0; j < NumRegs; ++j) - vec_store(&accTileOut[j], acc[j]); - } - - for (IndexType i = 0; i < PSQTBuckets / PsqtTileHeight; ++i) - { - // Load accumulator - auto accTilePsqtIn = reinterpret_cast( - &(computed->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_load_psqt(&accTilePsqtIn[j]); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]); - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = PSQTBuckets * 
index + i * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]); - } - - // Store accumulator - auto accTilePsqtOut = reinterpret_cast( - &(next->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); - for (std::size_t j = 0; j < NumPsqtRegs; ++j) - vec_store_psqt(&accTilePsqtOut[j], psqt[j]); - } - } -#else - std::memcpy((next->*accPtr).accumulation[Perspective], - (computed->*accPtr).accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); - std::memcpy((next->*accPtr).psqtAccumulation[Perspective], - (computed->*accPtr).psqtAccumulation[Perspective], - PSQTBuckets * sizeof(PSQTWeightType)); - - // Difference calculation for the deactivated features - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] -= weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] -= - psqtWeights[index * PSQTBuckets + i]; - } - - // Difference calculation for the activated features - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index; - for (IndexType i = 0; i < HalfDimensions; ++i) - (next->*accPtr).accumulation[Perspective][i] += weights[offset + i]; - - for (std::size_t i = 0; i < PSQTBuckets; ++i) - (next->*accPtr).psqtAccumulation[Perspective][i] += - psqtWeights[index * PSQTBuckets + i]; - } -#endif + apply_accumulator_updates_incremental(computed, next, removed, + added); (next->*accPtr).computed[Perspective] = true; @@ -666,15 +194,26 @@ class FeatureTransformer { update_accumulator_incremental(pos, next); } + template + inline void apply_accumulator_updates_incremental(StateInfo* computed_st, + StateInfo* next, + FeatureSet::IndexList& removed, + FeatureSet::IndexList& added) const; + + // Update the 
accumluator for the current position and refresh the cache. + // + // Instead of rebuilding the accumulator from scratch, the accumulator is + // updated by applying the differences between it and the cached one. template void update_accumulator_refresh_cache(const Position& pos, AccumulatorCaches::Cache* cache) const { assert(cache != nullptr); - Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq][Perspective]; FeatureSet::IndexList removed, added; + const Square ksq = pos.square(Perspective); + auto& entry = (*cache)[ksq][Perspective]; + for (Color c : {WHITE, BLACK}) { for (PieceType pt = PAWN; pt <= KING; ++pt) @@ -698,133 +237,24 @@ class FeatureTransformer { } } - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = true; - -#ifdef VECTOR - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; - - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto accTile = - reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); - auto entryTile = reinterpret_cast(&entry.accumulation[j * TileHeight]); - - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = entryTile[k]; - - int i = 0; - for (; i < int(std::min(removed.size(), added.size())); ++i) - { - IndexType indexR = removed[i]; - const IndexType offsetR = HalfDimensions * indexR + j * TileHeight; - auto columnR = reinterpret_cast(&weights[offsetR]); - IndexType indexA = added[i]; - const IndexType offsetA = HalfDimensions * indexA + j * TileHeight; - auto columnA = reinterpret_cast(&weights[offsetA]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); - } - for (; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } - for (; i < int(added.size()); ++i) - { - 
IndexType index = added[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&entryTile[k], acc[k]); - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&accTile[k], acc[k]); - } - - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - auto accTilePsqt = reinterpret_cast( - &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - auto entryTilePsqt = - reinterpret_cast(&entry.psqtAccumulation[j * PsqtTileHeight]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = entryTilePsqt[k]; - - for (int i = 0; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); - } - for (int i = 0; i < int(added.size()); ++i) - { - IndexType index = added[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&entryTilePsqt[k], psqt[k]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } - -#else - - for (const auto index : removed) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[j] -= weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k]; - } - for (const auto index : added) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 
0; j < HalfDimensions; ++j) - entry.accumulation[j] += weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k]; - } - - // The accumulator of the refresh entry has been updated. - // Now copy its content to the actual accumulator we were refreshing. - - std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, - sizeof(BiasType) * HalfDimensions); - - std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, - sizeof(int32_t) * PSQTBuckets); -#endif - for (Color c : {WHITE, BLACK}) entry.byColorBB[c] = pos.pieces(c); for (PieceType pt = PAWN; pt <= KING; ++pt) entry.byTypeBB[pt] = pos.pieces(pt); + + auto& accumulator = pos.state()->*accPtr; + apply_accumulator_updates_refresh_cache(accumulator, entry, removed, added); + accumulator.computed[Perspective] = true; } + template + inline void apply_accumulator_updates_refresh_cache( + Accumulator& accumulator, + typename AccumulatorCaches::Cache::Entry& entry, + FeatureSet::IndexList removed, + FeatureSet::IndexList added) const; + template void hint_common_access_for_perspective(const Position& pos, AccumulatorCaches::Cache* cache) const { @@ -860,6 +290,9 @@ class FeatureTransformer { update_accumulator_refresh_cache(pos, cache); } + // Called in transform after both accumulators are updated. 
+ inline void convert_accumulators(const Position& pos, OutputType* output) const; + template friend struct AccumulatorCaches::Cache; @@ -870,4 +303,18 @@ class FeatureTransformer { } // namespace Stockfish::Eval::NNUE -#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED +#if defined(__i386__) || defined(__amd64__) + + #include "arch/i386/nnue/nnue_feature_transformer.h" + +#elif defined(__arm__) || defined(__aarch64__) + + #include "arch/arm/nnue/nnue_feature_transformer.h" + +#else + + #include "arch/generic/nnue/nnue_feature_transformer.h" + +#endif + +#endif // NNUE_FEATURE_TRANSFORMER_H_INCLUDED diff --git a/src/position.cpp b/src/position.cpp index f596b015355..760c2d3ccb5 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) { for (Bitboard b = pos.checkers(); b;) os << UCIEngine::square(pop_lsb(b)) << " "; - if (int(Tablebases::MaxCardinality) >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + if (Tablebases::MaxCardinality >= int(popcount(pos.pieces())) && !pos.can_castle(ANY_CASTLING)) { StateInfo st; ASSERT_ALIGNED(&st, Eval::NNUE::CacheLineSize); @@ -1328,7 +1328,7 @@ bool Position::pos_is_ok() const { for (Piece pc : Pieces) - if (pieceCount[pc] != popcount(pieces(color_of(pc), type_of(pc))) + if (pieceCount[pc] != int(popcount(pieces(color_of(pc), type_of(pc)))) || pieceCount[pc] != std::count(board, board + SQUARE_NB, pc)) assert(0 && "pos_is_ok: Pieces"); diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp index 9b24e700b18..e23235db5ff 100644 --- a/src/syzygy/tbprobe.cpp +++ b/src/syzygy/tbprobe.cpp @@ -709,7 +709,7 @@ int map_score(TBTable* entry, File f, int value, WDLScore wdl) { } // A temporary fix for the compiler bug with AVX-512. 
(#4450) -#ifdef USE_AVX512 +#ifdef __AVX512F__ #if defined(__clang__) && defined(__clang_major__) && __clang_major__ >= 15 #define CLANG_AVX512_BUG_FIX __attribute__((optnone)) #endif @@ -1729,7 +1729,7 @@ Config Tablebases::rank_root_moves(const OptionsMap& options, config.probeDepth = 0; } - if (config.cardinality >= popcount(pos.pieces()) && !pos.can_castle(ANY_CASTLING)) + if (config.cardinality >= int(popcount(pos.pieces())) && !pos.can_castle(ANY_CASTLING)) { // Rank moves using DTZ tables config.rootInTB = root_probe(pos, rootMoves, options["Syzygy50MoveRule"], rankDTZ); diff --git a/src/types.h b/src/types.h index b12491d6cdd..5afc6f0c026 100644 --- a/src/types.h +++ b/src/types.h @@ -19,89 +19,11 @@ #ifndef TYPES_H_INCLUDED #define TYPES_H_INCLUDED -// When compiling with provided Makefile (e.g. for Linux and OSX), configuration -// is done automatically. To get started type 'make help'. -// -// When Makefile is not used (e.g. with Microsoft Visual Studio) some switches -// need to be set manually: -// -// -DNDEBUG | Disable debugging mode. Always use this for release. -// -// -DNO_PREFETCH | Disable use of prefetch asm-instruction. You may need this to -// | run on some very old machines. -// -// -DUSE_POPCNT | Add runtime support for use of popcnt asm-instruction. Works -// | only in 64-bit mode and requires hardware with popcnt support. -// -// -DUSE_PEXT | Add runtime support for use of pext asm-instruction. Works -// | only in 64-bit mode and requires hardware with pext support. 
- #include #include - #if defined(_MSC_VER) - // Disable some silly and noisy warnings from MSVC compiler - #pragma warning(disable: 4127) // Conditional expression is constant - #pragma warning(disable: 4146) // Unary minus operator applied to unsigned type - #pragma warning(disable: 4800) // Forcing value to bool 'true' or 'false' - #endif - -// Predefined macros hell: -// -// __GNUC__ Compiler is GCC, Clang or ICX -// __clang__ Compiler is Clang or ICX -// __INTEL_LLVM_COMPILER Compiler is ICX -// _MSC_VER Compiler is MSVC -// _WIN32 Building on Windows (any) -// _WIN64 Building on Windows 64 bit - - #if defined(__GNUC__) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) \ - && defined(_WIN32) && !defined(__clang__) - #define ALIGNAS_ON_STACK_VARIABLES_BROKEN - #endif - - #define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast(ptr) % alignment == 0) - - #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used - #include // Microsoft header for _BitScanForward64() - #define IS_64BIT - #endif - - #if defined(USE_POPCNT) && defined(_MSC_VER) - #include // Microsoft header for _mm_popcnt_u64() - #endif - - #if !defined(NO_PREFETCH) && defined(_MSC_VER) - #include // Microsoft header for _mm_prefetch() - #endif - - #if defined(USE_PEXT) - #include // Header for _pext_u64() intrinsic - #define pext(b, m) _pext_u64(b, m) - #else - #define pext(b, m) 0 - #endif - namespace Stockfish { - #ifdef USE_POPCNT -constexpr bool HasPopCnt = true; - #else -constexpr bool HasPopCnt = false; - #endif - - #ifdef USE_PEXT -constexpr bool HasPext = true; - #else -constexpr bool HasPext = false; - #endif - - #ifdef IS_64BIT -constexpr bool Is64Bit = true; - #else -constexpr bool Is64Bit = false; - #endif - using Key = uint64_t; using Bitboard = uint64_t;