diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2490c03..e78370a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,10 +8,15 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive - - name: Install llvm 16 - run: sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 && wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh && sudo ./llvm.sh 16 + - name: Install llvm 18 + run: | + sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 + rm llvm.sh - name: Build run: make diff --git a/Makefile b/Makefile index 8217c31..7fdd8e4 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,8 @@ - - -CC := clang-16 -LD := ld.lld-16 -OBJCOPY := llvm-objcopy-16 -AR := llvm-ar-16 -RANLIB := llvm-ranlib-16 +CC := clang-18 +LD := ld.lld-18 +OBJCOPY := llvm-objcopy-18 +AR := llvm-ar-18 +RANLIB := llvm-ranlib-18 UNAME := $(shell uname) ifeq ($(UNAME), Darwin) @@ -14,18 +12,19 @@ ifeq ($(UNAME), Darwin) AR := llvm-ar endif -CFLAGS := --target=riscv64 -march=rv64imc_zba_zbb_zbc_zbs -mabi=lp64 +CFLAGS := --target=riscv64 -march=rv64imc_zba_zbb_zbc_zbs -mabi=lp64 CFLAGS += -Os CFLAGS += -fdata-sections -ffunction-sections -fno-builtin -fvisibility=hidden -fomit-frame-pointer CFLAGS += -I compiler-rt/lib/builtins CFLAGS += -DVISIBILITY_HIDDEN -DCOMPILER_RT_HAS_FLOAT16 -RT_OBJ := build/fixunsdfdi.o \ +RT_OBJ := \ build/absvdi2.o \ build/absvsi2.o \ build/absvti2.o \ build/adddf3.o \ build/addsf3.o \ +build/addtf3.o \ build/addvdi3.o \ build/addvsi3.o \ build/addvti3.o \ @@ -36,6 +35,7 @@ build/ashrdi3.o \ build/ashrti3.o \ build/bswapdi2.o \ build/bswapsi2.o \ +build/clear_cache.o \ build/clzdi2.o \ build/clzsi2.o \ build/clzti2.o \ @@ -43,6 +43,9 @@ build/cmpdi2.o \ build/cmpti2.o \ build/comparedf2.o \ build/comparesf2.o \ +build/comparetf2.o \ +build/crtbegin.o \ +build/crtend.o \ build/ctzdi2.o \ build/ctzsi2.o \ build/ctzti2.o \ @@ -55,9 +58,16 @@ build/divmodti4.o \ build/divsc3.o \ build/divsf3.o \ build/divsi3.o \ +build/divtc3.o \ +build/divtf3.o \ build/divti3.o \ -build/extendsfdf2.o \ +build/extendbfsf2.o \ +build/extenddftf2.o \ build/extendhfsf2.o \ +build/extendhftf2.o \ +build/extendsfdf2.o \ +build/extendsftf2.o \ +build/extendxftf2.o \ build/ffsdi2.o \ build/ffssi2.o \ build/ffsti2.o \ @@ -67,25 +77,38 @@ build/fixdfti.o \ build/fixsfdi.o \ build/fixsfsi.o \ build/fixsfti.o \ +build/fixtfdi.o \ +build/fixtfsi.o \ +build/fixtfti.o \ build/fixunsdfdi.o \ build/fixunsdfsi.o \ build/fixunsdfti.o \ build/fixunssfdi.o \ build/fixunssfsi.o \ build/fixunssfti.o \ +build/fixunstfdi.o \ +build/fixunstfsi.o \ +build/fixunstfti.o \ build/floatdidf.o \ build/floatdisf.o \ +build/floatditf.o \ build/floatsidf.o \ build/floatsisf.o \ +build/floatsitf.o \ build/floattidf.o \ build/floattisf.o \ +build/floattitf.o \ build/floatundidf.o \ build/floatundisf.o \ +build/floatunditf.o \ build/floatunsidf.o \ build/floatunsisf.o \ +build/floatunsitf.o \ build/floatuntidf.o \ build/floatuntisf.o \ +build/floatuntitf.o \ build/fp_mode.o \ +build/gcc_personality_v0.o \ build/int_util.o \ build/lshrdi3.o \ build/lshrti3.o \ @@ -100,6 +123,8 @@ build/mulosi4.o \ build/muloti4.o \ build/mulsc3.o \ build/mulsf3.o \ +build/multc3.o \ +build/multf3.o \ build/multi3.o \ build/mulvdi3.o \ build/mulvsi3.o \ @@ -120,15 +145,23 @@ build/popcountsi2.o \ build/popcountti2.o \ build/powidf2.o \ build/powisf2.o \ +build/powitf2.o \ 
build/subdf3.o \ build/subsf3.o \ +build/subtf3.o \ build/subvdi3.o \ build/subvsi3.o \ build/subvti3.o \ build/trampoline_setup.o \ +build/truncdfbf2.o \ build/truncdfhf2.o \ build/truncdfsf2.o \ +build/truncsfbf2.o \ build/truncsfhf2.o \ +build/trunctfdf2.o \ +build/trunctfhf2.o \ +build/trunctfsf2.o \ +build/trunctfxf2.o \ build/ucmpdi2.o \ build/ucmpti2.o \ build/udivdi3.o \ @@ -139,38 +172,11 @@ build/udivsi3.o \ build/udivti3.o \ build/umoddi3.o \ build/umodsi3.o \ -build/umodti3.o \ -build/addtf3.o \ -build/comparetf2.o \ -build/divtc3.o \ -build/divtf3.o \ -build/extenddftf2.o \ -build/extendhftf2.o \ -build/extendsftf2.o \ -build/fixtfdi.o \ -build/fixtfsi.o \ -build/fixtfti.o \ -build/fixunstfdi.o \ -build/fixunstfsi.o \ -build/fixunstfti.o \ -build/floatditf.o \ -build/floatsitf.o \ -build/floattitf.o \ -build/floatunditf.o \ -build/floatunsitf.o \ -build/floatuntitf.o \ -build/multc3.o \ -build/multf3.o \ -build/powitf2.o \ -build/subtf3.o \ -build/trunctfdf2.o \ -build/trunctfhf2.o \ -build/trunctfsf2.o - -RISCV_OBJ := build/fp_mode.o build/muldi3.S.o -# build/save.o \ -# build/restore.o\ +build/umodti3.o +RISCV_OBJ := \ + build/fp_mode.o \ + build/muldi3.S.o all: build/libcompiler-rt.a @@ -189,6 +195,6 @@ build/muldi3.S.o: compiler-rt/lib/builtins/riscv/muldi3.S @echo build $< @$(CC) $(CFLAGS) -c -o $@ $< -clean: +clean: rm -f build/*.o rm -f build/*.a diff --git a/README.md b/README.md index 6d6e481..89d5c1f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,8 @@ we possess the capability to construct it entirely from scratch. ## Build To build it, run `make`; the static library will be generated at `build/libcompiler-rt.a`. Then use the following Makefile configuration: -``` + +```text LDFLAGS += -L./build -lcompiler-rt ``` @@ -15,11 +16,7 @@ LDFLAGS += -L./build -lcompiler-rt This project comes **completely** from the llvm project: - Repo: `https://github.com/llvm/llvm-project` -- Branch: `release/16.x` -- Commit: `7cbf1a259` - -At the same time, we pulled the following commit from LLVM main branch to fix some bugs in clang-16: - -- +- Branch: `release/18.x` +- Commit: `3b5b5c1` -See more: https://github.com/llvm/llvm-project/blob/release/16.x/compiler-rt/lib/builtins/README.txt +See more: https://github.com/llvm/llvm-project/blob/release/18.x/compiler-rt/lib/builtins/README.txt diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 2fc7052..13adbd6 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -3,14 +3,7 @@ # architecture-specific code in various subdirectories. if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - cmake_minimum_required(VERSION 3.13.4) - if ("${CMAKE_VERSION}" VERSION_LESS "3.20.0") - message(WARNING - "Your CMake version is ${CMAKE_VERSION}. Starting with LLVM 17.0.0, the " - "minimum version of CMake required to build LLVM will become 3.20.0, and " - "using an older CMake will become an error. Please upgrade your CMake to " - "at least 3.20.0 now to avoid issues in the future!") - endif() + cmake_minimum_required(VERSION 3.20.0) set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) project(CompilerRTBuiltins C ASM) @@ -45,6 +38,13 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) include(UseLibtool) endif() include(AddCompilerRT) + + if(MINGW) + # Simplified version of what's set in cmake/config-ix.cmake; not including + # builtins, which are linked separately.
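+    # The list is consumed further down by the standalone libatomic build,
+    # which appends ${MINGW_LIBRARIES} to COMPILER_RT_LIBATOMIC_LINK_LIBS_<arch>.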
+ set(MINGW_LIBRARIES mingw32 moldname mingwex msvcrt advapi32 shell32 + user32 kernel32 mingw32 moldname mingwex msvcrt) + endif() endif() if (COMPILER_RT_STANDALONE_BUILD) @@ -58,12 +58,9 @@ if (COMPILER_RT_STANDALONE_BUILD) endif() include(builtin-config-ix) +include(CMakeDependentOption) include(CMakePushCheckState) -if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") - include(CompilerRTAIXUtils) -endif() - option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -193,12 +190,11 @@ set(GENERIC_SOURCES # We only build BF16 files when "__bf16" is available. set(BF16_SOURCES + extendbfsf2.c truncdfbf2.c truncsfbf2.c ) -# TODO: Several "tf" files (and divtc3.c, but not multc3.c) are in -# GENERIC_SOURCES instead of here. set(GENERIC_TF_SOURCES addtf3.c comparetf2.c @@ -232,7 +228,7 @@ option(COMPILER_RT_EXCLUDE_ATOMIC_BUILTIN "Skip the atomic builtin (these should normally be provided by a shared library)" On) -if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD) +if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD AND NOT COMPILER_RT_GPU_BUILD) set(GENERIC_SOURCES ${GENERIC_SOURCES} emutls.c @@ -241,6 +237,14 @@ if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD) ) endif() +option(COMPILER_RT_LIBATOMIC_USE_PTHREAD + "Whether libatomic should use pthreads if available." + Off) + +if(COMPILER_RT_LIBATOMIC_USE_PTHREAD) + add_compile_definitions(_LIBATOMIC_USE_PTHREAD) +endif() + if(COMPILER_RT_HAS_ATOMIC_KEYWORD AND NOT COMPILER_RT_EXCLUDE_ATOMIC_BUILTIN) set(GENERIC_SOURCES ${GENERIC_SOURCES} @@ -276,7 +280,7 @@ endif() # These files are used on 32-bit and 64-bit x86. set(x86_ARCH_SOURCES - cpu_model.c + cpu_model/x86.c ) if (NOT MSVC) @@ -290,6 +294,7 @@ endif () # long double is not 80 bits on Android or MSVC. set(x86_80_BIT_SOURCES divxc3.c + extendxftf2.c fixxfdi.c fixxfti.c fixunsxfdi.c @@ -301,6 +306,7 @@ set(x86_80_BIT_SOURCES floatuntixf.c mulxc3.c powixf2.c + trunctfxf2.c ) if (NOT MSVC) @@ -310,17 +316,27 @@ if (NOT MSVC) ${x86_ARCH_SOURCES} x86_64/floatdidf.c x86_64/floatdisf.c - x86_64/floatundidf.S - x86_64/floatundisf.S ) + if (NOT WIN32) + set(x86_64_SOURCES + ${x86_64_SOURCES} + x86_64/floatundidf.S + x86_64/floatundisf.S + ) + endif() if (NOT ANDROID) set(x86_64_SOURCES ${x86_64_SOURCES} ${x86_80_BIT_SOURCES} x86_64/floatdixf.c - x86_64/floatundixf.S ) + if (NOT WIN32) + set(x86_64_SOURCES + ${x86_64_SOURCES} + x86_64/floatundixf.S + ) + endif() endif() # Darwin x86_64 Haswell @@ -330,7 +346,6 @@ if (NOT MSVC) set(x86_64_SOURCES ${x86_64_SOURCES} x86_64/chkstk.S - x86_64/chkstk2.S ) endif() @@ -364,7 +379,6 @@ if (NOT MSVC) set(i386_SOURCES ${i386_SOURCES} i386/chkstk.S - i386/chkstk2.S ) endif() else () # MSVC @@ -551,10 +565,29 @@ endif() set(aarch64_SOURCES ${GENERIC_TF_SOURCES} ${GENERIC_SOURCES} - cpu_model.c + cpu_model/aarch64.c aarch64/fp_mode.c ) +if (COMPILER_RT_HAS_AARCH64_SME) + if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c) + message(STATUS "AArch64 SME ABI routines enabled") + set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") + else() + if(COMPILER_RT_DISABLE_AARCH64_FMV) + message(WARNING "AArch64 SME ABI routines require function multiversioning support.") + endif() + if(NOT COMPILER_RT_HAS_FNO_BUILTIN_FLAG) + message(WARNING "AArch64 SME ABI 
routines require '-fno-builtin'") + endif() + if(NOT (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + message(WARNING "AArch64 SME ABI routines require sys/auxv.h or COMPILER_RT_BAREMETAL_BUILD flag") + endif() + message(STATUS "AArch64 SME ABI routines disabled") + endif() +endif() + # Generate outline atomics helpers from lse.S base set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir") file(MAKE_DIRECTORY "${OA_HELPERS_DIR}") @@ -567,11 +600,15 @@ endif() foreach(pat cas swp ldadd ldclr ldeor ldset) foreach(size 1 2 4 8 16) - foreach(model 1 2 3 4) + foreach(model 1 2 3 4 5) if(pat STREQUAL "cas" OR NOT size STREQUAL "16") + set(source_asm "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S") set(helper_asm "${OA_HELPERS_DIR}/outline_atomic_${pat}${size}_${model}.S") - list(APPEND lse_builtins "${helper_asm}") - list(APPEND arm64_lse_commands COMMAND ${CMAKE_COMMAND} -E ${COMPILER_RT_LINK_OR_COPY} "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S" "${helper_asm}") + add_custom_command( + OUTPUT "${helper_asm}" + COMMAND ${CMAKE_COMMAND} -E ${COMPILER_RT_LINK_OR_COPY} "${source_asm}" "${helper_asm}" + DEPENDS "${source_asm}" + ) set_source_files_properties("${helper_asm}" PROPERTIES COMPILE_DEFINITIONS "L_${pat};SIZE=${size};MODEL=${model}" @@ -590,6 +627,8 @@ if (MINGW) ) endif() +set(amdgcn_SOURCES ${GENERIC_SOURCES}) + set(armv4t_SOURCES ${arm_min_SOURCES}) set(armv5te_SOURCES ${arm_min_SOURCES}) set(armv6_SOURCES ${arm_min_SOURCES}) @@ -605,6 +644,7 @@ set(arm64_32_SOURCES ${aarch64_SOURCES}) set(armv6m_SOURCES ${thumb1_SOURCES}) set(armv7m_SOURCES ${arm_SOURCES}) set(armv7em_SOURCES ${arm_SOURCES}) +set(armv8m.base_SOURCES ${thumb1_SOURCES}) set(armv8m.main_SOURCES ${arm_SOURCES}) set(armv8.1m.main_SOURCES ${arm_SOURCES}) @@ -668,6 +708,8 @@ set(mips64_SOURCES ${GENERIC_TF_SOURCES} set(mips64el_SOURCES ${GENERIC_TF_SOURCES} ${mips_SOURCES}) +set(nvptx64_SOURCES ${GENERIC_SOURCES}) + set(powerpc_SOURCES ${GENERIC_SOURCES}) set(powerpcspe_SOURCES ${GENERIC_SOURCES}) @@ -686,7 +728,7 @@ set(powerpc64_SOURCES ${GENERIC_SOURCES} ) # These routines require __int128, which isn't supported on AIX. -if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") +if (NOT OS_NAME MATCHES "AIX") set(powerpc64_SOURCES ppc/floattitf.c ppc/fixtfti.c @@ -697,6 +739,7 @@ endif() set(powerpc64le_SOURCES ${powerpc64_SOURCES}) set(riscv_SOURCES + cpu_model/riscv.c riscv/fp_mode.c riscv/save.S riscv/restore.S @@ -731,7 +774,11 @@ set(ve_SOURCES ${GENERIC_SOURCES}) add_custom_target(builtins) -set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT Misc") +set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT/Metatargets") + +option(COMPILER_RT_ENABLE_SOFTWARE_INT128 + "Enable the int128 builtin routines for all targets." + OFF) if (APPLE) add_subdirectory(Darwin-excludes) @@ -746,6 +793,13 @@ else () endif() append_list_if(COMPILER_RT_HAS_STD_C11_FLAG -std=c11 BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_WBUILTIN_DECLARATION_MISMATCH_FLAG -Werror=builtin-declaration-mismatch BUILTIN_CFLAGS) + + # Don't embed directives for picking any specific CRT + if (MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "") + append_list_if(COMPILER_RT_HAS_ZL_FLAG /Zl BUILTIN_CFLAGS) + endif() # These flags would normally be added to CMAKE_C_FLAGS by the llvm # cmake step. Add them manually if this is a standalone build. @@ -762,6 +816,21 @@ else () endif() endif() + # Directly targeting the GPU requires a few extra flags. 
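+  # -ffreestanding and -nogpulib keep the hosted environment and the vendor
+  # device libraries out of the build, and -fconvergent-functions is needed
+  # because builtins may be called from convergent GPU code.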
+ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") + append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_FLTO_FLAG -flto BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_FCONVERGENT_FUNCTIONS_FLAG + -fconvergent-functions BUILTIN_CFLAGS) + + # AMDGPU targets want to use a generic ABI. + if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn") + append_list_if(COMPILER_RT_HAS_CODE_OBJECT_VERSION_FLAG + "SHELL:-Xclang -mcode-object-version=none" BUILTIN_CFLAGS) + endif() + endif() + set(BUILTIN_DEFS "") if(COMPILER_RT_BUILTINS_HIDE_SYMBOLS) @@ -820,20 +889,10 @@ else () # For RISCV32, we must force enable int128 for compiling long # double routines. - if("${arch}" STREQUAL "riscv32") + if(COMPILER_RT_ENABLE_SOFTWARE_INT128 OR "${arch}" STREQUAL "riscv32") list(APPEND BUILTIN_CFLAGS_${arch} -fforce-enable-int128) endif() - if(arch STREQUAL "aarch64") - add_custom_target( - lse_builtin_symlinks - BYPRODUCTS ${lse_builtins} - ${arm64_lse_commands} - ) - - set(deps_aarch64 lse_builtin_symlinks) - endif() - add_compiler_rt_runtime(clang_rt.builtins STATIC ARCHS ${arch} @@ -847,41 +906,44 @@ else () endforeach () endif () +add_dependencies(compiler-rt builtins) + option(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC "Build standalone shared atomic library." OFF) if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) add_custom_target(builtins-standalone-atomic) - set(BUILTIN_DEPS "") set(BUILTIN_TYPE SHARED) - if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if(OS_NAME MATCHES "AIX") + include(CompilerRTAIXUtils) if(NOT COMPILER_RT_LIBATOMIC_LINK_FLAGS) get_aix_libatomic_default_link_flags(COMPILER_RT_LIBATOMIC_LINK_FLAGS "${CMAKE_CURRENT_SOURCE_DIR}/ppc/atomic.exp") endif() - # The compiler needs builtins to link any other binaries, so let - # clang_rt.atomic be built after builtins. - set(BUILTIN_DEPS builtins) # For different versions of cmake, SHARED behaves differently. For some # versions, we might need MODULE rather than SHARED. get_aix_libatomic_type(BUILTIN_TYPE) + else() + list(APPEND COMPILER_RT_LIBATOMIC_LINK_FLAGS -nodefaultlibs) endif() foreach (arch ${BUILTIN_SUPPORTED_ARCH}) if(CAN_TARGET_${arch}) + list(APPEND COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch} clang_rt.builtins-${arch}) + append_list_if(MINGW "${MINGW_LIBRARIES}" COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch}) add_compiler_rt_runtime(clang_rt.atomic ${BUILTIN_TYPE} ARCHS ${arch} SOURCES atomic.c LINK_FLAGS ${COMPILER_RT_LIBATOMIC_LINK_FLAGS} - DEPS ${BUILTIN_DEPS} + LINK_LIBS ${COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch}} PARENT_TARGET builtins-standalone-atomic) endif() endforeach() # FIXME: On AIX, we have to archive built shared libraries into a static # archive, i.e., libatomic.a. Once cmake adds support of such usage for AIX, # this ad-hoc part can be removed. 
- if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if(OS_NAME MATCHES "AIX") archive_aix_libatomic(clang_rt.atomic libatomic ARCHS ${BUILTIN_SUPPORTED_ARCH} PARENT_TARGET builtins-standalone-atomic) @@ -889,4 +951,40 @@ if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) add_dependencies(compiler-rt builtins-standalone-atomic) endif() -add_dependencies(compiler-rt builtins) +cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COMPILER_RT_HAS_CRT" OFF) + +if (COMPILER_RT_BUILD_CRT) + add_compiler_rt_component(crt) + + option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON) + + include(CheckSectionExists) + check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY + SOURCE "volatile int x;\n__attribute__((constructor)) void f(void) {x = 0;}\nint main(void) { return 0; }\n") + + append_list_if(COMPILER_RT_HAS_STD_C11_FLAG -std=c11 CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_INITFINI_ARRAY -DCRT_HAS_INITFINI_ARRAY CRT_CFLAGS) + append_list_if(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY -DEH_USE_FRAME_REGISTRY CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_WNO_PEDANTIC -Wno-pedantic CRT_CFLAGS) + if (COMPILER_RT_HAS_FCF_PROTECTION_FLAG) + append_list_if(COMPILER_RT_ENABLE_CET -fcf-protection=full CRT_CFLAGS) + endif() + + foreach(arch ${BUILTIN_SUPPORTED_ARCH}) + add_compiler_rt_runtime(clang_rt.crtbegin + OBJECT + ARCHS ${arch} + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/crtbegin.c + CFLAGS ${CRT_CFLAGS} + PARENT_TARGET crt) + add_compiler_rt_runtime(clang_rt.crtend + OBJECT + ARCHS ${arch} + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/crtend.c + CFLAGS ${CRT_CFLAGS} + PARENT_TARGET crt) + endforeach() + + add_dependencies(compiler-rt crt) +endif() diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index 53d656d..19f26c9 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -35,13 +35,13 @@ typedef uint64_t du_int; // Integral bit manipulation -di_int __ashldi3(di_int a, si_int b); // a << b -ti_int __ashlti3(ti_int a, si_int b); // a << b +di_int __ashldi3(di_int a, int b); // a << b +ti_int __ashlti3(ti_int a, int b); // a << b -di_int __ashrdi3(di_int a, si_int b); // a >> b arithmetic (sign fill) -ti_int __ashrti3(ti_int a, si_int b); // a >> b arithmetic (sign fill) -di_int __lshrdi3(di_int a, si_int b); // a >> b logical (zero fill) -ti_int __lshrti3(ti_int a, si_int b); // a >> b logical (zero fill) +di_int __ashrdi3(di_int a, int b); // a >> b arithmetic (sign fill) +ti_int __ashrti3(ti_int a, int b); // a >> b arithmetic (sign fill) +di_int __lshrdi3(di_int a, int b); // a >> b logical (zero fill) +ti_int __lshrti3(ti_int a, int b); // a >> b logical (zero fill) int __clzsi2(si_int a); // count leading zeros int __clzdi2(di_int a); // count leading zeros @@ -137,49 +137,54 @@ si_int __ucmpti2(tu_int a, tu_int b); di_int __fixsfdi( float a); di_int __fixdfdi( double a); di_int __fixxfdi(long double a); +di_int __fixtfdi( tf_float a); ti_int __fixsfti( float a); ti_int __fixdfti( double a); ti_int __fixxfti(long double a); -uint64_t __fixtfdi(long double input); // ppc only, doesn't match documentation +ti_int __fixtfti( tf_float a); su_int __fixunssfsi( float a); su_int __fixunsdfsi( double a); su_int __fixunsxfsi(long double a); +su_int __fixunstfsi( tf_float a); du_int __fixunssfdi( float a); du_int __fixunsdfdi( double a); du_int __fixunsxfdi(long double a); +du_int __fixunstfdi( tf_float a); tu_int __fixunssfti( float a); 
tu_int __fixunsdfti( double a); tu_int __fixunsxfti(long double a); -uint64_t __fixunstfdi(long double input); // ppc only +tu_int __fixunstfti( tf_float a); float __floatdisf(di_int a); double __floatdidf(di_int a); long double __floatdixf(di_int a); -long double __floatditf(int64_t a); // ppc only +tf_float __floatditf(int64_t a); float __floattisf(ti_int a); double __floattidf(ti_int a); long double __floattixf(ti_int a); +tf_float __floattitf(ti_int a); float __floatundisf(du_int a); double __floatundidf(du_int a); long double __floatundixf(du_int a); -long double __floatunditf(uint64_t a); // ppc only +tf_float __floatunditf(du_int a); float __floatuntisf(tu_int a); double __floatuntidf(tu_int a); long double __floatuntixf(tu_int a); +tf_float __floatuntitf(tu_int a); // Floating point raised to integer power float __powisf2( float a, int b); // a ^ b double __powidf2( double a, int b); // a ^ b long double __powixf2(long double a, int b); // a ^ b -long double __powitf2(long double a, int b); // ppc only, a ^ b +tf_float __powitf2( tf_float a, int b); // a ^ b // Complex arithmetic @@ -189,8 +194,7 @@ long double __powitf2(long double a, int b); // ppc only, a ^ b double _Complex __muldc3(double a, double b, double c, double d); long double _Complex __mulxc3(long double a, long double b, long double c, long double d); -long double _Complex __multc3(long double a, long double b, - long double c, long double d); // ppc only + tf_float _Complex __multc3(tf_float a, tf_float b, tf_float c, tf_float d); // (a + ib) / (c + id) @@ -198,8 +202,7 @@ long double _Complex __multc3(long double a, long double b, double _Complex __divdc3(double a, double b, double c, double d); long double _Complex __divxc3(long double a, long double b, long double c, long double d); -long double _Complex __divtc3(long double a, long double b, - long double c, long double d); // ppc only + tf_float _Complex __divtc3(tf_float a, tf_float b, tf_float c, tf_float d); // Runtime support @@ -269,6 +272,11 @@ switch32 switch8 switchu8 +// This function generates a custom trampoline function with the specific +// realFunc and localsPtr values. +void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated, + const void* realFunc, void* localsPtr); + // There is no C interface to the *_vfp_d8_d15_regs functions. They are // called in the prolog and epilog of Thumb1 functions. When the C++ ABI uses // SJLJ for exceptions, each function with a catch clause or destructors needs diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index 5dc0d53..1fe18f4 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -7,7 +7,7 @@ // Out-of-line LSE atomics helpers. Ported from libgcc library.
// N = {1, 2, 4, 8} // M = {1, 2, 4, 8, 16} -// ORDER = {'relax', 'acq', 'rel', 'acq_rel'} +// ORDER = {'relax', 'acq', 'rel', 'acq_rel', 'sync'} // Routines implemented: // // iM __aarch64_casM_ORDER(iM expected, iM desired, iM *ptr) @@ -35,8 +35,8 @@ HIDDEN(___aarch64_have_lse_atomics) #endif // Generate mnemonics for -// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4 -// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4 +// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4,5 +// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4,5 #if SIZE == 1 #define S b @@ -64,24 +64,44 @@ HIDDEN(___aarch64_have_lse_atomics) #define L #define M 0x000000 #define N 0x000000 +#define BARRIER #elif MODEL == 2 #define SUFF _acq #define A a #define L #define M 0x400000 #define N 0x800000 +#define BARRIER #elif MODEL == 3 #define SUFF _rel #define A #define L l #define M 0x008000 #define N 0x400000 +#define BARRIER #elif MODEL == 4 #define SUFF _acq_rel #define A a #define L l #define M 0x408000 #define N 0xc00000 +#define BARRIER +#elif MODEL == 5 +#define SUFF _sync +#ifdef L_swp +// swp has _acq semantics. +#define A a +#define L +#define M 0x400000 +#define N 0x800000 +#else +// All other _sync functions have _seq semantics. +#define A a +#define L l +#define M 0x408000 +#define N 0xc00000 +#endif +#define BARRIER dmb ish #else #error #endif // MODEL @@ -96,7 +116,12 @@ HIDDEN(___aarch64_have_lse_atomics) #endif #define NAME(BASE) GLUE4(__aarch64_, BASE, SIZE, SUFF) +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXR GLUE3(ld, xr, S) +#else #define LDXR GLUE4(ld, A, xr, S) +#endif #define STXR GLUE4(st, L, xr, S) // Define temporary registers. @@ -136,9 +161,15 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXR w(tmp1), s(1), [x2] cbnz w(tmp1), 0b 1: + BARRIER ret #else +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXP GLUE2(ld, xp) +#else #define LDXP GLUE3(ld, A, xp) +#endif #define STXP GLUE3(st, L, xp) #ifdef HAS_ASM_LSE #define CASP GLUE3(casp, A, L) x0, x1, x2, x3, [x4] @@ -159,6 +190,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXP w(tmp2), x2, x3, [x4] cbnz w(tmp2), 0b 1: + BARRIER ret #endif END_COMPILERRT_OUTLINE_FUNCTION(NAME(cas)) @@ -180,6 +212,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(swp)) LDXR s(0), [x1] STXR w(tmp1), s(tmp0), [x1] cbnz w(tmp1), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(swp)) #endif // L_swp @@ -224,6 +257,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(LDNM)) OP s(tmp1), s(0), s(tmp0) STXR w(tmp2), s(tmp1), [x1] cbnz w(tmp2), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM)) #endif // L_ldadd L_ldclr L_ldeor L_ldset diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c new file mode 100644 index 0000000..b6ee121 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c @@ -0,0 +1,52 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +__attribute__((visibility("hidden"), nocommon)) +_Bool __aarch64_has_sme_and_tpidr2_el0; + +// We have multiple ways to check that the function has SME, depending on our +// target. +// * For Linux we can use __getauxval(). +// * For newlib we can use __aarch64_sme_accessible(). 
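+// Whichever probe is used, the result is cached in
+// __aarch64_has_sme_and_tpidr2_el0 by the constructor below, so the assembly
+// routines in sme-abi.S can test for SME with a load instead of re-probing.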
+ +#if defined(__linux__) + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef HWCAP2_SME +#define HWCAP2_SME (1 << 23) +#endif + +extern unsigned long int __getauxval (unsigned long int); + +static _Bool has_sme(void) { + return __getauxval(AT_HWCAP2) & HWCAP2_SME; +} + +#else // defined(__linux__) + +#if defined(COMPILER_RT_SHARED_LIB) +__attribute__((weak)) +#endif +extern _Bool __aarch64_sme_accessible(void); + +static _Bool has_sme(void) { +#if defined(COMPILER_RT_SHARED_LIB) + if (!__aarch64_sme_accessible) + return 0; +#endif + return __aarch64_sme_accessible(); +} + +#endif // defined(__linux__) + +#if __GNUC__ >= 9 +#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" +#endif +__attribute__((constructor(90))) +static void init_aarch64_has_sme(void) { + __aarch64_has_sme_and_tpidr2_el0 = has_sme(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c new file mode 100644 index 0000000..4b9ee8c --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c @@ -0,0 +1,18 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../cpu_model/aarch64.h" + +struct FEATURES { + unsigned long long features; +}; + +extern struct FEATURES __aarch64_cpu_features; + +CONSTRUCTOR_ATTRIBUTE static void get_aarch64_cpu_features(void) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + __init_cpu_features(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S new file mode 100644 index 0000000..3e9bd2c --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -0,0 +1,230 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This patch implements the support routines for the SME ABI, +// described here: +// https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines + +#include "../assembly.h" + + +#if !defined(__APPLE__) +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features) +#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features) +#else +// MachO requires @page/@pageoff directives because the global is defined +// in a different file. Otherwise this file may fail to build. +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page +#define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page +#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff +#endif + +.arch armv9-a+sme + +// Utility function which calls a system's abort() routine. Because the function +// is streaming-compatible it should disable streaming-SVE mode before calling +// abort(). Note that there is no need to preserve any state before the call, +// because the function does not return. +DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) + .cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + BTI_C + stp x29, x30, [sp, #-32]! 
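+  // cntd reads the number of 64-bit elements in an SVE vector, which is the
+  // current vector granule (VG).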
+ cntd x0 + // Store VG to a stack location that we describe with .cfi_offset + str x0, [sp, #16] + .cfi_def_cfa_offset 32 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + .cfi_offset 46, -16 + bl __arm_sme_state + tbz x0, #0, 2f +1: + smstop sm +2: + // We can't make this into a tail-call because the unwinder would + // need to restore the value of VG. + bl SYMBOL_NAME(abort) + .cfi_endproc +END_COMPILERRT_FUNCTION(do_abort) + +// __arm_sme_state fills the result registers based on a local +// that is set as part of the compiler-rt startup code. +// __aarch64_has_sme_and_tpidr2_el0 +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) + .variant_pcs __arm_sme_state + BTI_C + mov x0, xzr + mov x1, xzr + + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + orr x0, x0, #0xC000000000000000 + mrs x16, SVCR + bfxil x0, x16, #0, #2 + mrs x1, TPIDR2_EL0 +1: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) + .variant_pcs __arm_tpidr2_restore + BTI_C + // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific + // manner. + mrs x14, TPIDR2_EL0 + cbnz x14, 2f + + // If any of the reserved bytes in the first 16 bytes of BLK are nonzero, + // the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x0, #10] + cbnz w14, 2f + ldr w14, [x0, #12] + cbnz w14, 2f + + // If BLK.za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x0] + cbz x16, 1f + + // If BLK.num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x0, #8] + cbz x14, 1f + + mov x15, xzr +0: + ldr za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) + .variant_pcs __arm_tpidr2_save + BTI_C + // If the current thread does not have access to TPIDR2_EL0, the subroutine + // does nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 1f + + // If TPIDR2_EL0 is null, the subroutine does nothing. + mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x16, #10] + cbnz w14, 2f + ldr w14, [x16, #12] + cbnz w14, 2f + + // If num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x16, #8] + cbz x14, 1f + + // If za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x16] + cbz x16, 1f + + mov x15, xzr +0: + str za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) + .variant_pcs __arm_za_disable + BTI_C + // If the current thread does not have access to SME, the subroutine does + // nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 0f + + // Otherwise, the subroutine behaves as if it did the following: + // * Call __arm_tpidr2_save. + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + bl __arm_tpidr2_save + + // * Set TPIDR2_EL0 to null. + msr TPIDR2_EL0, xzr + + // * Set PSTATE.ZA to 0. 
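+  //   (smstop za clears only PSTATE.ZA; streaming mode, PSTATE.SM, is left
+  //   unchanged.)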
+ smstop za + + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 +0: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg) + .variant_pcs __arm_get_current_vg + BTI_C + + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + adrp x17, CPU_FEATS_SYMBOL + ldr w17, [x17, CPU_FEATS_SYMBOL_OFFSET] + tbnz w17, #30, 0f + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + mov x18, x1 + bl __arm_sme_state + mov x1, x18 + and x17, x17, #0x40000000 + bfxil x17, x0, #0, #1 + cbz x17, 1f + cntd x0 + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +1: + mov x0, xzr + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg) + +NO_EXEC_STACK_DIRECTIVE + +// GNU property note for BTI and PAC +GNU_PROPERTY_BTI_PAC diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S new file mode 100644 index 0000000..0318d9a --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S @@ -0,0 +1,352 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Routines taken from libc/AOR_v20.02/string/aarch64 + +#include "../assembly.h" + +#ifdef __aarch64__ + +#define L(l) .L ## l + +// +// __arm_sc_memcpy / __arm_sc_memmove +// + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend1 x4 +#define dstend1 x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend1 +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy) + add srcend1, src, count + add dstend1, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend1, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend1, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend1, -8] + str A_l, [dstin] + str A_h, [dstend1, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. 
*/ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend1, -4] + str A_lw, [dstin] + str B_lw, [dstend1, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend1, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend1, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend1, -32] + ldp D_l, D_h, [srcend1, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend1, -64] + ldp H_l, H_h, [srcend1, -48] + stp G_l, G_h, [dstend1, -64] + stp H_l, H_h, [dstend1, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend1, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend1, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend1, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend1, -64] + stp A_l, A_h, [dstend1, -48] + stp B_l, B_h, [dstend1, -32] + stp C_l, C_h, [dstend1, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend1, -16] + and tmp1, dstend1, 15 + sub srcend1, srcend1, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend1, -16] + stp D_l, D_h, [dstend1, -16] + ldp B_l, B_h, [srcend1, -32] + ldp C_l, C_h, [srcend1, -48] + ldp D_l, D_h, [srcend1, -64]! + sub dstend1, dstend1, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [srcend1, -16] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [srcend1, -48] + stp D_l, D_h, [dstend1, -64]! + ldp D_l, D_h, [srcend1, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend1, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy) + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy) + + +// +// __arm_sc_memset +// + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend2 x4 +#define zva_val x5 + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset) +#ifdef __ARM_FEATURE_SVE + mov z0.b, valw +#else + bfi valw, valw, #8, #8 + bfi valw, valw, #16, #16 + bfi val, val, #32, #32 + fmov d0, val + fmov v0.d[1], val +#endif + add dstend2, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend2, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend2, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend2, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend2, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend2, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend2, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend2, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret + +L(no_zva): + sub count, dstend2, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! 
+ subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset) + +#endif // __aarch64__ diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c new file mode 100644 index 0000000..315490e --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c @@ -0,0 +1,12 @@ +#include <stddef.h> + +const void *__arm_sc_memchr(const void *src, int c, + size_t n) __arm_streaming_compatible { + const unsigned char *srcp = (const unsigned char *)src; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + if (srcp[i] == c8) + return &srcp[i]; + + return NULL; +} diff --git a/compiler-rt/lib/builtins/absvdi2.c b/compiler-rt/lib/builtins/absvdi2.c index b9566cd..291ab5f 100644 --- a/compiler-rt/lib/builtins/absvdi2.c +++ b/compiler-rt/lib/builtins/absvdi2.c @@ -18,7 +18,7 @@ COMPILER_RT_ABI di_int __absvdi2(di_int a) { const int N = (int)(sizeof(di_int) * CHAR_BIT); - if (a == ((di_int)1 << (N - 1))) + if (a == ((di_int)((du_int)1 << (N - 1)))) compilerrt_abort(); const di_int t = a >> (N - 1); return (a ^ t) - t; diff --git a/compiler-rt/lib/builtins/absvsi2.c b/compiler-rt/lib/builtins/absvsi2.c index 9d5de7e..9977c33 100644 --- a/compiler-rt/lib/builtins/absvsi2.c +++ b/compiler-rt/lib/builtins/absvsi2.c @@ -18,7 +18,7 @@ COMPILER_RT_ABI si_int __absvsi2(si_int a) { const int N = (int)(sizeof(si_int) * CHAR_BIT); - if (a == ((si_int)1 << (N - 1))) + if (a == ((si_int)((su_int)1 << (N - 1)))) compilerrt_abort(); const si_int t = a >> (N - 1); return (a ^ t) - t; diff --git a/compiler-rt/lib/builtins/absvti2.c b/compiler-rt/lib/builtins/absvti2.c index 491d99d..bc6933b 100644 --- a/compiler-rt/lib/builtins/absvti2.c +++ b/compiler-rt/lib/builtins/absvti2.c @@ -20,7 +20,7 @@ COMPILER_RT_ABI ti_int __absvti2(ti_int a) { const int N = (int)(sizeof(ti_int) * CHAR_BIT); - if (a == ((ti_int)1 << (N - 1))) + if (a == (ti_int)((tu_int)1 << (N - 1))) compilerrt_abort(); const ti_int s = a >> (N - 1); return (a ^ s) - s; diff --git a/compiler-rt/lib/builtins/addtf3.c b/compiler-rt/lib/builtins/addtf3.c index 86e4f4c..2cb3a4d 100644 --- a/compiler-rt/lib/builtins/addtf3.c +++ b/compiler-rt/lib/builtins/addtf3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_add_impl.inc" COMPILER_RT_ABI fp_t __addtf3(fp_t a, fp_t b) { diff --git a/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S b/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S index bd039a0..c7abdb0 100644 --- a/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S +++ b/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S @@ -8,10 +8,6 @@ #include "../assembly.h" -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error big endian support not implemented -#endif - #define APSR_Z (1 << 30) #define APSR_C (1 << 29) diff --git a/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S b/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S index a26cb2a..81c4766 100644 --- a/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S +++ b/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S @@ -8,10 +8,6 @@ #include "../assembly.h" -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error big endian support not implemented -#endif - #define APSR_Z (1 << 30) #define APSR_C (1 << 29) diff --git a/compiler-rt/lib/builtins/arm/divsi3.S b/compiler-rt/lib/builtins/arm/divsi3.S index 761bf49..faf9af9 100644 --- a/compiler-rt/lib/builtins/arm/divsi3.S +++ 
b/compiler-rt/lib/builtins/arm/divsi3.S @@ -37,7 +37,8 @@ DEFINE_COMPILERRT_FUNCTION(__divsi3) sdiv r0, r0, r1 bx lr LOCAL_LABEL(divzero): - mov r0,#0 + // Use movs for compatibility with v8-m.base. + movs r0,#0 bx lr #else ESTABLISH_FRAME diff --git a/compiler-rt/lib/builtins/arm/udivsi3.S b/compiler-rt/lib/builtins/arm/udivsi3.S index 9b1b035..16528e8 100644 --- a/compiler-rt/lib/builtins/arm/udivsi3.S +++ b/compiler-rt/lib/builtins/arm/udivsi3.S @@ -32,7 +32,8 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) bx lr LOCAL_LABEL(divby0): - mov r0, #0 + // Use movs for compatibility with v8-m.base. + movs r0, #0 # ifdef __ARM_EABI__ b __aeabi_idiv0 # else @@ -203,7 +204,7 @@ LOCAL_LABEL(divby0): LOCAL_LABEL(block_skip_##shift) :; \ adcs r3, r3 // same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. - // TODO: if current location counter is not not word aligned, we don't + // TODO: if current location counter is not word aligned, we don't // need the .p2align and nop // Label div0block must be word-aligned. First align block 31 .p2align 2 diff --git a/compiler-rt/lib/builtins/ashldi3.c b/compiler-rt/lib/builtins/ashldi3.c index 04f2222..7b835da 100644 --- a/compiler-rt/lib/builtins/ashldi3.c +++ b/compiler-rt/lib/builtins/ashldi3.c @@ -28,7 +28,8 @@ COMPILER_RT_ABI di_int __ashldi3(di_int a, int b) { if (b == 0) return a; result.s.low = input.s.low << b; - result.s.high = (input.s.high << b) | (input.s.low >> (bits_in_word - b)); + result.s.high = + ((su_int)input.s.high << b) | (input.s.low >> (bits_in_word - b)); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashlti3.c b/compiler-rt/lib/builtins/ashlti3.c index 2d7bd4a..2bebf10 100644 --- a/compiler-rt/lib/builtins/ashlti3.c +++ b/compiler-rt/lib/builtins/ashlti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __ashlti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); twords input; twords result; @@ -30,7 +30,8 @@ COMPILER_RT_ABI ti_int __ashlti3(ti_int a, si_int b) { if (b == 0) return a; result.s.low = input.s.low << b; - result.s.high = (input.s.high << b) | (input.s.low >> (bits_in_dword - b)); + result.s.high = + ((du_int)input.s.high << b) | (input.s.low >> (bits_in_dword - b)); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashrdi3.c b/compiler-rt/lib/builtins/ashrdi3.c index 934a5c4..c0879b8 100644 --- a/compiler-rt/lib/builtins/ashrdi3.c +++ b/compiler-rt/lib/builtins/ashrdi3.c @@ -29,7 +29,8 @@ COMPILER_RT_ABI di_int __ashrdi3(di_int a, int b) { if (b == 0) return a; result.s.high = input.s.high >> b; - result.s.low = (input.s.high << (bits_in_word - b)) | (input.s.low >> b); + result.s.low = + ((su_int)input.s.high << (bits_in_word - b)) | (input.s.low >> b); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashrti3.c b/compiler-rt/lib/builtins/ashrti3.c index f573b6d..d6b1ad9 100644 --- a/compiler-rt/lib/builtins/ashrti3.c +++ b/compiler-rt/lib/builtins/ashrti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __ashrti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __ashrti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); twords input; twords result; @@ -31,7 +31,8 @@ COMPILER_RT_ABI ti_int __ashrti3(ti_int a, si_int b) { if (b == 0) return a; result.s.high = input.s.high >> b; - result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b); + result.s.low = + ((du_int)input.s.high << (bits_in_dword 
- b)) | (input.s.low >> b); } return result.all; } diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h index 69a3d86..8c42fc7 100644 --- a/compiler-rt/lib/builtins/assembly.h +++ b/compiler-rt/lib/builtins/assembly.h @@ -260,14 +260,15 @@ .globl name SEPARATOR \ SYMBOL_IS_FUNC(name) SEPARATOR \ DECLARE_SYMBOL_VISIBILITY_UNMANGLED(name) SEPARATOR \ - CFI_START SEPARATOR \ DECLARE_FUNC_ENCODING \ - name: SEPARATOR BTI_C + name: \ + SEPARATOR CFI_START \ + SEPARATOR BTI_C #define DEFINE_COMPILERRT_FUNCTION_ALIAS(name, target) \ .globl SYMBOL_NAME(name) SEPARATOR \ SYMBOL_IS_FUNC(SYMBOL_NAME(name)) SEPARATOR \ - DECLARE_SYMBOL_VISIBILITY(SYMBOL_NAME(name)) SEPARATOR \ + DECLARE_SYMBOL_VISIBILITY(name) SEPARATOR \ .set SYMBOL_NAME(name), SYMBOL_NAME(target) SEPARATOR #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/atomic.c b/compiler-rt/lib/builtins/atomic.c index 852bb20..aded25d 100644 --- a/compiler-rt/lib/builtins/atomic.c +++ b/compiler-rt/lib/builtins/atomic.c @@ -12,7 +12,7 @@ // // 1) This code must work with C programs that do not link to anything // (including pthreads) and so it should not depend on any pthread -// functions. +// functions. If the user wishes to opt into using pthreads, they may do so. // 2) Atomic operations, rather than explicit mutexes, are most commonly used // on code where contended operations are rare. // @@ -56,7 +56,17 @@ static const long SPINLOCK_MASK = SPINLOCK_COUNT - 1; // defined. Each platform should define the Lock type, and corresponding // lock() and unlock() functions. //////////////////////////////////////////////////////////////////////////////// -#if defined(__FreeBSD__) || defined(__DragonFly__) +#if defined(_LIBATOMIC_USE_PTHREAD) +#include <pthread.h> +typedef pthread_mutex_t Lock; +/// Unlock a lock. This is a release operation. +__inline static void unlock(Lock *l) { pthread_mutex_unlock(l); } +/// Locks a lock. +__inline static void lock(Lock *l) { pthread_mutex_lock(l); } +/// locks for atomic operations +static Lock locks[SPINLOCK_COUNT]; + +#elif defined(__FreeBSD__) || defined(__DragonFly__) #include <errno.h> // clang-format off #include <sys/types.h> diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index 8993761..2ac99b2 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -110,10 +110,14 @@ void __clear_cache(void *start, void *end) { "jr.hb $at\n" "move $at, $0\n" ".set at"); -#else +#elif defined(__linux__) || defined(__OpenBSD__) // Pre-R6 may not be globalized. And some implementations may give strange // synci_step. So, let's use libc call for it. 
- cacheflush(start, end_int - start_int, BCACHE); + _flush_cache(start, end_int - start_int, BCACHE); +#else + (void)start_int; + (void)end_int; + compilerrt_abort(); #endif } #elif defined(__aarch64__) && !defined(__APPLE__) diff --git a/compiler-rt/lib/builtins/comparetf2.c b/compiler-rt/lib/builtins/comparetf2.c index f159245..be5e9e5 100644 --- a/compiler-rt/lib/builtins/comparetf2.c +++ b/compiler-rt/lib/builtins/comparetf2.c @@ -39,7 +39,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_compare_impl.inc" COMPILER_RT_ABI CMP_RESULT __letf2(fp_t a, fp_t b) { return __leXf2__(a, b); } diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c deleted file mode 100644 index f5ad530..0000000 --- a/compiler-rt/lib/builtins/cpu_model.c +++ /dev/null @@ -1,1357 +0,0 @@ -//===-- cpu_model.c - Support for __cpu_model builtin ------------*- C -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on LLVM's lib/Support/Host.cpp. -// It implements the operating system Host concept and builtin -// __cpu_model for the compiler_rt library for x86 and -// __aarch64_have_lse_atomics, __aarch64_cpu_features for AArch64. -// -//===----------------------------------------------------------------------===// - -#ifndef __has_attribute -#define __has_attribute(attr) 0 -#endif - -#if __has_attribute(constructor) -#if __GNUC__ >= 9 -// Ordinarily init priorities below 101 are disallowed as they are reserved for the -// implementation. However, we are the implementation, so silence the diagnostic, -// since it doesn't apply to us. -#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" -#endif -// We're choosing init priority 90 to force our constructors to run before any -// constructors in the end user application (starting at priority 101). This value -// matches the libgcc choice for the same functions. -#define CONSTRUCTOR_ATTRIBUTE __attribute__((constructor(90))) -#else -// FIXME: For MSVC, we should make a function pointer global in .CRT$X?? so that -// this runs during initialization. 
-#define CONSTRUCTOR_ATTRIBUTE -#endif - -#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64)) && \ - (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) - -#include <assert.h> - -#define bool int -#define true 1 -#define false 0 - -#ifdef _MSC_VER -#include <intrin.h> -#endif - -enum VendorSignatures { - SIG_INTEL = 0x756e6547, // Genu - SIG_AMD = 0x68747541, // Auth -}; - -enum ProcessorVendors { - VENDOR_INTEL = 1, - VENDOR_AMD, - VENDOR_OTHER, - VENDOR_MAX -}; - -enum ProcessorTypes { - INTEL_BONNELL = 1, - INTEL_CORE2, - INTEL_COREI7, - AMDFAM10H, - AMDFAM15H, - INTEL_SILVERMONT, - INTEL_KNL, - AMD_BTVER1, - AMD_BTVER2, - AMDFAM17H, - INTEL_KNM, - INTEL_GOLDMONT, - INTEL_GOLDMONT_PLUS, - INTEL_TREMONT, - AMDFAM19H, - ZHAOXIN_FAM7H, - INTEL_SIERRAFOREST, - INTEL_GRANDRIDGE, - CPU_TYPE_MAX -}; - -enum ProcessorSubtypes { - INTEL_COREI7_NEHALEM = 1, - INTEL_COREI7_WESTMERE, - INTEL_COREI7_SANDYBRIDGE, - AMDFAM10H_BARCELONA, - AMDFAM10H_SHANGHAI, - AMDFAM10H_ISTANBUL, - AMDFAM15H_BDVER1, - AMDFAM15H_BDVER2, - AMDFAM15H_BDVER3, - AMDFAM15H_BDVER4, - AMDFAM17H_ZNVER1, - INTEL_COREI7_IVYBRIDGE, - INTEL_COREI7_HASWELL, - INTEL_COREI7_BROADWELL, - INTEL_COREI7_SKYLAKE, - INTEL_COREI7_SKYLAKE_AVX512, - INTEL_COREI7_CANNONLAKE, - INTEL_COREI7_ICELAKE_CLIENT, - INTEL_COREI7_ICELAKE_SERVER, - AMDFAM17H_ZNVER2, - INTEL_COREI7_CASCADELAKE, - INTEL_COREI7_TIGERLAKE, - INTEL_COREI7_COOPERLAKE, - INTEL_COREI7_SAPPHIRERAPIDS, - INTEL_COREI7_ALDERLAKE, - AMDFAM19H_ZNVER3, - INTEL_COREI7_ROCKETLAKE, - ZHAOXIN_FAM7H_LUJIAZUI, - AMDFAM19H_ZNVER4, - INTEL_COREI7_GRANITERAPIDS, - CPU_SUBTYPE_MAX -}; - -enum ProcessorFeatures { - FEATURE_CMOV = 0, - FEATURE_MMX, - FEATURE_POPCNT, - FEATURE_SSE, - FEATURE_SSE2, - FEATURE_SSE3, - FEATURE_SSSE3, - FEATURE_SSE4_1, - FEATURE_SSE4_2, - FEATURE_AVX, - FEATURE_AVX2, - FEATURE_SSE4_A, - FEATURE_FMA4, - FEATURE_XOP, - FEATURE_FMA, - FEATURE_AVX512F, - FEATURE_BMI, - FEATURE_BMI2, - FEATURE_AES, - FEATURE_PCLMUL, - FEATURE_AVX512VL, - FEATURE_AVX512BW, - FEATURE_AVX512DQ, - FEATURE_AVX512CD, - FEATURE_AVX512ER, - FEATURE_AVX512PF, - FEATURE_AVX512VBMI, - FEATURE_AVX512IFMA, - FEATURE_AVX5124VNNIW, - FEATURE_AVX5124FMAPS, - FEATURE_AVX512VPOPCNTDQ, - FEATURE_AVX512VBMI2, - FEATURE_GFNI, - FEATURE_VPCLMULQDQ, - FEATURE_AVX512VNNI, - FEATURE_AVX512BITALG, - FEATURE_AVX512BF16, - FEATURE_AVX512VP2INTERSECT, - CPU_FEATURE_MAX -}; - -// The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max). -// Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID -// support. Consequently, for i386, the presence of CPUID is checked first -// via the corresponding eflags bit. -static bool isCpuIdSupported(void) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__i386__) - int __cpuid_supported; - __asm__(" pushfl\n" - " popl %%eax\n" - " movl %%eax,%%ecx\n" - " xorl $0x00200000,%%eax\n" - " pushl %%eax\n" - " popfl\n" - " pushfl\n" - " popl %%eax\n" - " movl $0,%0\n" - " cmpl %%eax,%%ecx\n" - " je 1f\n" - " movl $1,%0\n" - "1:" - : "=r"(__cpuid_supported) - : - : "eax", "ecx"); - if (!__cpuid_supported) - return false; -#endif - return true; -#endif - return true; -} - -// This code is copied from lib/Support/Host.cpp. -// Changes to either file should be mirrored in the other. - -/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in -/// the specified arguments. If we can't run cpuid on the host, return true. 
-static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, - unsigned *rECX, unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__x86_64__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. - // FIXME: should we save this for Clang? - __asm__("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value)); - return false; -#elif defined(__i386__) - __asm__("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value)); - return false; -#else - return true; -#endif -#elif defined(_MSC_VER) - // The MSVC intrinsic is portable across x86 and x64. - int registers[4]; - __cpuid(registers, value); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -} - -/// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return -/// the 4 values in the specified arguments. If we can't run cpuid on the host, -/// return true. -static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, - unsigned *rEAX, unsigned *rEBX, unsigned *rECX, - unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__x86_64__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. - // FIXME: should we save this for Clang? - __asm__("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value), "c"(subleaf)); - return false; -#elif defined(__i386__) - __asm__("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value), "c"(subleaf)); - return false; -#else - return true; -#endif -#elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -} - -// Read control register 0 (XCR0). Used to detect features such as AVX. -static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) - // Check xgetbv; this uses a .byte sequence instead of the instruction - // directly because older assemblers do not include support for xgetbv and - // there is no easy way to conditionally compile based on the assembler used. - __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(*rEAX), "=d"(*rEDX) : "c"(0)); - return false; -#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) - unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - *rEAX = Result; - *rEDX = Result >> 32; - return false; -#else - return true; -#endif -} - -static void detectX86FamilyModel(unsigned EAX, unsigned *Family, - unsigned *Model) { - *Family = (EAX >> 8) & 0xf; // Bits 8 - 11 - *Model = (EAX >> 4) & 0xf; // Bits 4 - 7 - if (*Family == 6 || *Family == 0xf) { - if (*Family == 0xf) - // Examine extended family ID if family ID is F. - *Family += (EAX >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. 
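// Worked example: an i7-6700K (Skylake) reports EAX = 0x000506E3 in leaf 1.
// Family = (EAX >> 8) & 0xF = 6 and base Model = (EAX >> 4) & 0xF = 0xE;
// because Family == 6, the extended model bits (EAX >> 16) & 0xF = 5 are
// folded in below, giving Model = 0x5E -- later matched as "skylake".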
- *Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 - } -} - -static const char * -getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, - const unsigned *Features, - unsigned *Type, unsigned *Subtype) { -#define testFeature(F) \ - (Features[F / 32] & (1 << (F % 32))) != 0 - - // We select CPU strings to match the code in Host.cpp, but we don't use them - // in compiler-rt. - const char *CPU = 0; - - switch (Family) { - case 6: - switch (Model) { - case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile - // processor, Intel Core 2 Quad processor, Intel Core 2 Quad - // mobile processor, Intel Core 2 Extreme processor, Intel - // Pentium Dual-Core processor, Intel Xeon processor, model - // 0Fh. All processors are manufactured using the 65 nm process. - case 0x16: // Intel Celeron processor model 16h. All processors are - // manufactured using the 65 nm process - CPU = "core2"; - *Type = INTEL_CORE2; - break; - case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model - // 17h. All processors are manufactured using the 45 nm process. - // - // 45nm: Penryn , Wolfdale, Yorkfield (XE) - case 0x1d: // Intel Xeon processor MP. All processors are manufactured using - // the 45 nm process. - CPU = "penryn"; - *Type = INTEL_CORE2; - break; - case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 45 nm process. - case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. - // As found in a Summer 2010 model iMac. - case 0x1f: - case 0x2e: // Nehalem EX - CPU = "nehalem"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_NEHALEM; - break; - case 0x25: // Intel Core i7, laptop version. - case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 32 nm process. - case 0x2f: // Westmere EX - CPU = "westmere"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_WESTMERE; - break; - case 0x2a: // Intel Core i7 processor. All processors are manufactured - // using the 32 nm process. 
- case 0x2d: - CPU = "sandybridge"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SANDYBRIDGE; - break; - case 0x3a: - case 0x3e: // Ivy Bridge EP - CPU = "ivybridge"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_IVYBRIDGE; - break; - - // Haswell: - case 0x3c: - case 0x3f: - case 0x45: - case 0x46: - CPU = "haswell"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_HASWELL; - break; - - // Broadwell: - case 0x3d: - case 0x47: - case 0x4f: - case 0x56: - CPU = "broadwell"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_BROADWELL; - break; - - // Skylake: - case 0x4e: // Skylake mobile - case 0x5e: // Skylake desktop - case 0x8e: // Kaby Lake mobile - case 0x9e: // Kaby Lake desktop - case 0xa5: // Comet Lake-H/S - case 0xa6: // Comet Lake-U - CPU = "skylake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SKYLAKE; - break; - - // Rocketlake: - case 0xa7: - CPU = "rocketlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ROCKETLAKE; - break; - - // Skylake Xeon: - case 0x55: - *Type = INTEL_COREI7; - if (testFeature(FEATURE_AVX512BF16)) { - CPU = "cooperlake"; - *Subtype = INTEL_COREI7_COOPERLAKE; - } else if (testFeature(FEATURE_AVX512VNNI)) { - CPU = "cascadelake"; - *Subtype = INTEL_COREI7_CASCADELAKE; - } else { - CPU = "skylake-avx512"; - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; - } - break; - - // Cannonlake: - case 0x66: - CPU = "cannonlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_CANNONLAKE; - break; - - // Icelake: - case 0x7d: - case 0x7e: - CPU = "icelake-client"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_CLIENT; - break; - - // Tigerlake: - case 0x8c: - case 0x8d: - CPU = "tigerlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_TIGERLAKE; - break; - - // Alderlake: - case 0x97: - case 0x9a: - // Raptorlake: - case 0xb7: - // Meteorlake: - case 0xaa: - case 0xac: - CPU = "alderlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ALDERLAKE; - break; - - // Icelake Xeon: - case 0x6a: - case 0x6c: - CPU = "icelake-server"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_SERVER; - break; - - // Emerald Rapids: - case 0xcf: - // Sapphire Rapids: - case 0x8f: - CPU = "sapphirerapids"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SAPPHIRERAPIDS; - break; - - // Granite Rapids: - case 0xae: - case 0xad: - CPU = "graniterapids"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_GRANITERAPIDS; - break; - - case 0x1c: // Most 45 nm Intel Atom processors - case 0x26: // 45 nm Atom Lincroft - case 0x27: // 32 nm Atom Medfield - case 0x35: // 32 nm Atom Midview - case 0x36: // 32 nm Atom Midview - CPU = "bonnell"; - *Type = INTEL_BONNELL; - break; - - // Atom Silvermont codes from the Intel software optimization guide. - case 0x37: - case 0x4a: - case 0x4d: - case 0x5a: - case 0x5d: - case 0x4c: // really airmont - CPU = "silvermont"; - *Type = INTEL_SILVERMONT; - break; - // Goldmont: - case 0x5c: // Apollo Lake - case 0x5f: // Denverton - CPU = "goldmont"; - *Type = INTEL_GOLDMONT; - break; // "goldmont" - case 0x7a: - CPU = "goldmont-plus"; - *Type = INTEL_GOLDMONT_PLUS; - break; - case 0x86: - CPU = "tremont"; - *Type = INTEL_TREMONT; - break; - - // Sierraforest: - case 0xaf: - CPU = "sierraforest"; - *Type = INTEL_SIERRAFOREST; - break; - - // Grandridge: - case 0xb6: - CPU = "grandridge"; - *Type = INTEL_GRANDRIDGE; - break; - - case 0x57: - CPU = "knl"; - *Type = INTEL_KNL; - break; - - case 0x85: - CPU = "knm"; - *Type = INTEL_KNM; - break; - - default: // Unknown family 6 CPU. 
- break; - } - break; - default: - break; // Unknown. - } - - return CPU; -} - -static const char * -getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, - const unsigned *Features, - unsigned *Type, unsigned *Subtype) { - // We select CPU strings to match the code in Host.cpp, but we don't use them - // in compiler-rt. - const char *CPU = 0; - - switch (Family) { - case 16: - CPU = "amdfam10"; - *Type = AMDFAM10H; - switch (Model) { - case 2: - *Subtype = AMDFAM10H_BARCELONA; - break; - case 4: - *Subtype = AMDFAM10H_SHANGHAI; - break; - case 8: - *Subtype = AMDFAM10H_ISTANBUL; - break; - } - break; - case 20: - CPU = "btver1"; - *Type = AMD_BTVER1; - break; - case 21: - CPU = "bdver1"; - *Type = AMDFAM15H; - if (Model >= 0x60 && Model <= 0x7f) { - CPU = "bdver4"; - *Subtype = AMDFAM15H_BDVER4; - break; // 60h-7Fh: Excavator - } - if (Model >= 0x30 && Model <= 0x3f) { - CPU = "bdver3"; - *Subtype = AMDFAM15H_BDVER3; - break; // 30h-3Fh: Steamroller - } - if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) { - CPU = "bdver2"; - *Subtype = AMDFAM15H_BDVER2; - break; // 02h, 10h-1Fh: Piledriver - } - if (Model <= 0x0f) { - *Subtype = AMDFAM15H_BDVER1; - break; // 00h-0Fh: Bulldozer - } - break; - case 22: - CPU = "btver2"; - *Type = AMD_BTVER2; - break; - case 23: - CPU = "znver1"; - *Type = AMDFAM17H; - if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) { - CPU = "znver2"; - *Subtype = AMDFAM17H_ZNVER2; - break; // 30h-3fh, 71h: Zen2 - } - if (Model <= 0x0f) { - *Subtype = AMDFAM17H_ZNVER1; - break; // 00h-0Fh: Zen1 - } - break; - case 25: - CPU = "znver3"; - *Type = AMDFAM19H; - if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) { - // Family 19h Models 00h-0Fh - Zen3 - // Family 19h Models 20h-2Fh - Zen3 - // Family 19h Models 30h-3Fh - Zen3 - // Family 19h Models 40h-4Fh - Zen3+ - // Family 19h Models 50h-5Fh - Zen3+ - *Subtype = AMDFAM19H_ZNVER3; - break; - } - if ((Model >= 0x10 && Model <= 0x1f) || - (Model >= 0x60 && Model <= 0x74) || - (Model >= 0x78 && Model <= 0x7b) || - (Model >= 0xA0 && Model <= 0xAf)) { - CPU = "znver4"; - *Subtype = AMDFAM19H_ZNVER4; - break; // "znver4" - } - break; - default: - break; // Unknown AMD CPU. - } - - return CPU; -} - -static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, - unsigned *Features) { - unsigned EAX, EBX; - -#define setFeature(F) \ - Features[F / 32] |= 1U << (F % 32) - - if ((EDX >> 15) & 1) - setFeature(FEATURE_CMOV); - if ((EDX >> 23) & 1) - setFeature(FEATURE_MMX); - if ((EDX >> 25) & 1) - setFeature(FEATURE_SSE); - if ((EDX >> 26) & 1) - setFeature(FEATURE_SSE2); - - if ((ECX >> 0) & 1) - setFeature(FEATURE_SSE3); - if ((ECX >> 1) & 1) - setFeature(FEATURE_PCLMUL); - if ((ECX >> 9) & 1) - setFeature(FEATURE_SSSE3); - if ((ECX >> 12) & 1) - setFeature(FEATURE_FMA); - if ((ECX >> 19) & 1) - setFeature(FEATURE_SSE4_1); - if ((ECX >> 20) & 1) - setFeature(FEATURE_SSE4_2); - if ((ECX >> 23) & 1) - setFeature(FEATURE_POPCNT); - if ((ECX >> 25) & 1) - setFeature(FEATURE_AES); - - // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV - // indicates that the AVX registers will be saved and restored on context - // switch, then we have full AVX support. 
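// Concretely: bit 27 (OSXSAVE) and bit 28 (AVX) of CPUID.1:ECX must both be
// set, and XCR0 bits 1..2 (SSE and AVX state) must read back as 0x6. For
// example, XCR0 = 0x7 passes the check below, while XCR0 = 0x3 means the OS
// does not save YMM state, so AVX is not reported even on AVX hardware.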
- const unsigned AVXBits = (1 << 27) | (1 << 28); - bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && - ((EAX & 0x6) == 0x6); -#if defined(__APPLE__) - // Darwin lazily saves the AVX512 context on first use: trust that the OS will - // save the AVX512 context if we use AVX512 instructions, even the bit is not - // set right now. - bool HasAVX512Save = true; -#else - // AVX512 requires additional context to be saved by the OS. - bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0); -#endif - - if (HasAVX) - setFeature(FEATURE_AVX); - - bool HasLeaf7 = - MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); - - if (HasLeaf7 && ((EBX >> 3) & 1)) - setFeature(FEATURE_BMI); - if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX) - setFeature(FEATURE_AVX2); - if (HasLeaf7 && ((EBX >> 8) & 1)) - setFeature(FEATURE_BMI2); - if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512F); - if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512DQ); - if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512IFMA); - if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512PF); - if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512ER); - if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512CD); - if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BW); - if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VL); - - if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VBMI); - if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VBMI2); - if (HasLeaf7 && ((ECX >> 8) & 1)) - setFeature(FEATURE_GFNI); - if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVX) - setFeature(FEATURE_VPCLMULQDQ); - if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VNNI); - if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BITALG); - if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VPOPCNTDQ); - - if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX5124VNNIW); - if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX5124FMAPS); - if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VP2INTERSECT); - - bool HasLeaf7Subleaf1 = - MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); - if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BF16); - - unsigned MaxExtLevel; - getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); - - bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 && - !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); - if (HasExtLeaf1 && ((ECX >> 6) & 1)) - setFeature(FEATURE_SSE4_A); - if (HasExtLeaf1 && ((ECX >> 11) & 1)) - setFeature(FEATURE_XOP); - if (HasExtLeaf1 && ((ECX >> 16) & 1)) - setFeature(FEATURE_FMA4); -#undef setFeature -} - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -int __cpu_indicator_init(void) CONSTRUCTOR_ATTRIBUTE; - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -struct __processor_model { - unsigned int __cpu_vendor; - unsigned int __cpu_type; - unsigned int __cpu_subtype; - unsigned int __cpu_features[1]; -} __cpu_model = {0, 0, 0, {0}}; - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -unsigned int __cpu_features2 = 0; - -// A constructor function that is sets 
__cpu_model and __cpu_features2 with -// the right values. This needs to run only once. This constructor is -// given the highest priority and it should run before constructors without -// the priority set. However, it still runs after ifunc initializers and -// needs to be called explicitly there. - -int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { - unsigned EAX, EBX, ECX, EDX; - unsigned MaxLeaf = 5; - unsigned Vendor; - unsigned Model, Family; - unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - - // This function needs to run just once. - if (__cpu_model.__cpu_vendor) - return 0; - - if (!isCpuIdSupported() || - getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { - __cpu_model.__cpu_vendor = VENDOR_OTHER; - return -1; - } - - getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX); - detectX86FamilyModel(EAX, &Family, &Model); - - // Find available features. - getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]); - - assert((sizeof(Features)/sizeof(Features[0])) == 2); - __cpu_model.__cpu_features[0] = Features[0]; - __cpu_features2 = Features[1]; - - if (Vendor == SIG_INTEL) { - // Get CPU type. - getIntelProcessorTypeAndSubtype(Family, Model, &Features[0], - &(__cpu_model.__cpu_type), - &(__cpu_model.__cpu_subtype)); - __cpu_model.__cpu_vendor = VENDOR_INTEL; - } else if (Vendor == SIG_AMD) { - // Get CPU type. - getAMDProcessorTypeAndSubtype(Family, Model, &Features[0], - &(__cpu_model.__cpu_type), - &(__cpu_model.__cpu_subtype)); - __cpu_model.__cpu_vendor = VENDOR_AMD; - } else - __cpu_model.__cpu_vendor = VENDOR_OTHER; - - assert(__cpu_model.__cpu_vendor < VENDOR_MAX); - assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); - assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); - - return 0; -} -#elif defined(__aarch64__) - -#ifndef AT_HWCAP -#define AT_HWCAP 16 -#endif -#ifndef HWCAP_CPUID -#define HWCAP_CPUID (1 << 11) -#endif -#ifndef HWCAP_FP -#define HWCAP_FP (1 << 0) -#endif -#ifndef HWCAP_ASIMD -#define HWCAP_ASIMD (1 << 1) -#endif -#ifndef HWCAP_AES -#define HWCAP_AES (1 << 3) -#endif -#ifndef HWCAP_PMULL -#define HWCAP_PMULL (1 << 4) -#endif -#ifndef HWCAP_SHA1 -#define HWCAP_SHA1 (1 << 5) -#endif -#ifndef HWCAP_SHA2 -#define HWCAP_SHA2 (1 << 6) -#endif -#ifndef HWCAP_ATOMICS -#define HWCAP_ATOMICS (1 << 8) -#endif -#ifndef HWCAP_FPHP -#define HWCAP_FPHP (1 << 9) -#endif -#ifndef HWCAP_ASIMDHP -#define HWCAP_ASIMDHP (1 << 10) -#endif -#ifndef HWCAP_ASIMDRDM -#define HWCAP_ASIMDRDM (1 << 12) -#endif -#ifndef HWCAP_JSCVT -#define HWCAP_JSCVT (1 << 13) -#endif -#ifndef HWCAP_FCMA -#define HWCAP_FCMA (1 << 14) -#endif -#ifndef HWCAP_LRCPC -#define HWCAP_LRCPC (1 << 15) -#endif -#ifndef HWCAP_DCPOP -#define HWCAP_DCPOP (1 << 16) -#endif -#ifndef HWCAP_SHA3 -#define HWCAP_SHA3 (1 << 17) -#endif -#ifndef HWCAP_SM3 -#define HWCAP_SM3 (1 << 18) -#endif -#ifndef HWCAP_SM4 -#define HWCAP_SM4 (1 << 19) -#endif -#ifndef HWCAP_ASIMDDP -#define HWCAP_ASIMDDP (1 << 20) -#endif -#ifndef HWCAP_SHA512 -#define HWCAP_SHA512 (1 << 21) -#endif -#ifndef HWCAP_SVE -#define HWCAP_SVE (1 << 22) -#endif -#ifndef HWCAP_ASIMDFHM -#define HWCAP_ASIMDFHM (1 << 23) -#endif -#ifndef HWCAP_DIT -#define HWCAP_DIT (1 << 24) -#endif -#ifndef HWCAP_ILRCPC -#define HWCAP_ILRCPC (1 << 26) -#endif -#ifndef HWCAP_FLAGM -#define HWCAP_FLAGM (1 << 27) -#endif -#ifndef HWCAP_SSBS -#define HWCAP_SSBS (1 << 28) -#endif -#ifndef HWCAP_SB -#define HWCAP_SB (1 << 29) -#endif - -#ifndef AT_HWCAP2 -#define AT_HWCAP2 26 -#endif -#ifndef HWCAP2_DCPODP -#define HWCAP2_DCPODP (1 << 0) -#endif -#ifndef 
HWCAP2_SVE2 -#define HWCAP2_SVE2 (1 << 1) -#endif -#ifndef HWCAP2_SVEAES -#define HWCAP2_SVEAES (1 << 2) -#endif -#ifndef HWCAP2_SVEPMULL -#define HWCAP2_SVEPMULL (1 << 3) -#endif -#ifndef HWCAP2_SVEBITPERM -#define HWCAP2_SVEBITPERM (1 << 4) -#endif -#ifndef HWCAP2_SVESHA3 -#define HWCAP2_SVESHA3 (1 << 5) -#endif -#ifndef HWCAP2_SVESM4 -#define HWCAP2_SVESM4 (1 << 6) -#endif -#ifndef HWCAP2_FLAGM2 -#define HWCAP2_FLAGM2 (1 << 7) -#endif -#ifndef HWCAP2_FRINT -#define HWCAP2_FRINT (1 << 8) -#endif -#ifndef HWCAP2_SVEI8MM -#define HWCAP2_SVEI8MM (1 << 9) -#endif -#ifndef HWCAP2_SVEF32MM -#define HWCAP2_SVEF32MM (1 << 10) -#endif -#ifndef HWCAP2_SVEF64MM -#define HWCAP2_SVEF64MM (1 << 11) -#endif -#ifndef HWCAP2_SVEBF16 -#define HWCAP2_SVEBF16 (1 << 12) -#endif -#ifndef HWCAP2_I8MM -#define HWCAP2_I8MM (1 << 13) -#endif -#ifndef HWCAP2_BF16 -#define HWCAP2_BF16 (1 << 14) -#endif -#ifndef HWCAP2_DGH -#define HWCAP2_DGH (1 << 15) -#endif -#ifndef HWCAP2_RNG -#define HWCAP2_RNG (1 << 16) -#endif -#ifndef HWCAP2_BTI -#define HWCAP2_BTI (1 << 17) -#endif -#ifndef HWCAP2_MTE -#define HWCAP2_MTE (1 << 18) -#endif -#ifndef HWCAP2_RPRES -#define HWCAP2_RPRES (1 << 21) -#endif -#ifndef HWCAP2_MTE3 -#define HWCAP2_MTE3 (1 << 22) -#endif -#ifndef HWCAP2_SME -#define HWCAP2_SME (1 << 23) -#endif -#ifndef HWCAP2_SME_I16I64 -#define HWCAP2_SME_I16I64 (1 << 24) -#endif -#ifndef HWCAP2_SME_F64F64 -#define HWCAP2_SME_F64F64 (1 << 25) -#endif -#ifndef HWCAP2_WFXT -#define HWCAP2_WFXT (1UL << 31) -#endif -#ifndef HWCAP2_EBF16 -#define HWCAP2_EBF16 (1UL << 32) -#endif -#ifndef HWCAP2_SVE_EBF16 -#define HWCAP2_SVE_EBF16 (1UL << 33) -#endif - -// LSE support detection for out-of-line atomics -// using HWCAP and Auxiliary vector -_Bool __aarch64_have_lse_atomics - __attribute__((visibility("hidden"), nocommon)); - -#if defined(__has_include) -#if __has_include() -#include -#if __has_include() -#include - -#if defined(__ANDROID__) -#include -#include -#elif defined(__Fuchsia__) -#include -#include -#endif - -// Detect Exynos 9810 CPU -#define IF_EXYNOS9810 \ - char arch[PROP_VALUE_MAX]; \ - if (__system_property_get("ro.arch", arch) > 0 && \ - strncmp(arch, "exynos9810", sizeof("exynos9810") - 1) == 0) - -static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) { -#if defined(__FreeBSD__) - unsigned long hwcap; - int result = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); - __aarch64_have_lse_atomics = result == 0 && (hwcap & HWCAP_ATOMICS) != 0; -#elif defined(__Fuchsia__) - // This ensures the vDSO is a direct link-time dependency of anything that - // needs this initializer code. -#pragma comment(lib, "zircon") - uint32_t features; - zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); - __aarch64_have_lse_atomics = - status == ZX_OK && (features & ZX_ARM64_FEATURE_ISA_ATOMICS) != 0; -#else - unsigned long hwcap = getauxval(AT_HWCAP); - _Bool result = (hwcap & HWCAP_ATOMICS) != 0; -#if defined(__ANDROID__) - if (result) { - // Some cores in the Exynos 9810 CPU are ARMv8.2 and others are ARMv8.0; - // only the former support LSE atomics. However, the kernel in the - // initial Android 8.0 release of Galaxy S9/S9+ devices incorrectly - // reported the feature as being supported. - // - // The kernel appears to have been corrected to mark it unsupported as of - // the Android 9.0 release on those devices, and this issue has not been - // observed anywhere else. Thus, this workaround may be removed if - // compiler-rt ever drops support for Android 8.0. 
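// For example, on an affected Galaxy S9 the system property reads back as
// ro.arch = "exynos9810", so the guarded statement below executes and LSE is
// disabled even though HWCAP_ATOMICS was set in the auxiliary vector.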
- IF_EXYNOS9810 result = false; - } -#endif // defined(__ANDROID__) - __aarch64_have_lse_atomics = result; -#endif // defined(__FreeBSD__) -} - -#if !defined(DISABLE_AARCH64_FMV) -// CPUFeatures must correspond to the same AArch64 features in -// AArch64TargetParser.h -enum CPUFeatures { - FEAT_RNG, - FEAT_FLAGM, - FEAT_FLAGM2, - FEAT_FP16FML, - FEAT_DOTPROD, - FEAT_SM4, - FEAT_RDM, - FEAT_LSE, - FEAT_FP, - FEAT_SIMD, - FEAT_CRC, - FEAT_SHA1, - FEAT_SHA2, - FEAT_SHA3, - FEAT_AES, - FEAT_PMULL, - FEAT_FP16, - FEAT_DIT, - FEAT_DPB, - FEAT_DPB2, - FEAT_JSCVT, - FEAT_FCMA, - FEAT_RCPC, - FEAT_RCPC2, - FEAT_FRINTTS, - FEAT_DGH, - FEAT_I8MM, - FEAT_BF16, - FEAT_EBF16, - FEAT_RPRES, - FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, - FEAT_SVE_F32MM, - FEAT_SVE_F64MM, - FEAT_SVE2, - FEAT_SVE_AES, - FEAT_SVE_PMULL128, - FEAT_SVE_BITPERM, - FEAT_SVE_SHA3, - FEAT_SVE_SM4, - FEAT_SME, - FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, - FEAT_SB, - FEAT_PREDRES, - FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, - FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, - FEAT_WFXT, - FEAT_SME_F64, - FEAT_SME_I64, - FEAT_SME2, - FEAT_MAX -}; - -// Architecture features used -// in Function Multi Versioning -struct { - unsigned long long features; - // As features grows new fields could be added -} __aarch64_cpu_features __attribute__((visibility("hidden"), nocommon)); - -void init_cpu_features_resolver(unsigned long hwcap, unsigned long hwcap2) { -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F -#define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) -#define extractBits(val, start, number) \ - (val & ((1ULL << number) - 1ULL) << start) >> start - if (hwcap & HWCAP_CRC32) - setCPUFeature(FEAT_CRC); - if (hwcap & HWCAP_PMULL) - setCPUFeature(FEAT_PMULL); - if (hwcap & HWCAP_FLAGM) - setCPUFeature(FEAT_FLAGM); - if (hwcap2 & HWCAP2_FLAGM2) { - setCPUFeature(FEAT_FLAGM); - setCPUFeature(FEAT_FLAGM2); - } - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) - setCPUFeature(FEAT_SM4); - if (hwcap & HWCAP_ASIMDDP) - setCPUFeature(FEAT_DOTPROD); - if (hwcap & HWCAP_ASIMDFHM) - setCPUFeature(FEAT_FP16FML); - if (hwcap & HWCAP_FPHP) { - setCPUFeature(FEAT_FP16); - setCPUFeature(FEAT_FP); - } - if (hwcap & HWCAP_DIT) - setCPUFeature(FEAT_DIT); - if (hwcap & HWCAP_ASIMDRDM) - setCPUFeature(FEAT_RDM); - if (hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC2); - if (hwcap & HWCAP_AES) - setCPUFeature(FEAT_AES); - if (hwcap & HWCAP_SHA1) - setCPUFeature(FEAT_SHA1); - if (hwcap & HWCAP_SHA2) - setCPUFeature(FEAT_SHA2); - if (hwcap & HWCAP_JSCVT) - setCPUFeature(FEAT_JSCVT); - if (hwcap & HWCAP_FCMA) - setCPUFeature(FEAT_FCMA); - if (hwcap & HWCAP_SB) - setCPUFeature(FEAT_SB); - if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - setCPUFeature(FEAT_MEMTAG3); - } - if (hwcap2 & HWCAP2_SVEAES) - setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); - setCPUFeature(FEAT_SVE_PMULL128); - } - if (hwcap2 & HWCAP2_SVEBITPERM) - setCPUFeature(FEAT_SVE_BITPERM); - if (hwcap2 & HWCAP2_SVESHA3) - setCPUFeature(FEAT_SVE_SHA3); - if (hwcap2 & HWCAP2_SVESM4) - setCPUFeature(FEAT_SVE_SM4); - if (hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB2); - if (hwcap & HWCAP_ATOMICS) - setCPUFeature(FEAT_LSE); - if (hwcap2 & HWCAP2_RNG) - setCPUFeature(FEAT_RNG); - if (hwcap2 & HWCAP2_I8MM) - 
setCPUFeature(FEAT_I8MM); - if (hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_EBF16); - if (hwcap2 & HWCAP2_SVE_EBF16) - setCPUFeature(FEAT_SVE_EBF16); - if (hwcap2 & HWCAP2_DGH) - setCPUFeature(FEAT_DGH); - if (hwcap2 & HWCAP2_FRINT) - setCPUFeature(FEAT_FRINTTS); - if (hwcap2 & HWCAP2_SVEI8MM) - setCPUFeature(FEAT_SVE_I8MM); - if (hwcap2 & HWCAP2_SVEF32MM) - setCPUFeature(FEAT_SVE_F32MM); - if (hwcap2 & HWCAP2_SVEF64MM) - setCPUFeature(FEAT_SVE_F64MM); - if (hwcap2 & HWCAP2_BTI) - setCPUFeature(FEAT_BTI); - if (hwcap2 & HWCAP2_RPRES) - setCPUFeature(FEAT_RPRES); - if (hwcap2 & HWCAP2_WFXT) - setCPUFeature(FEAT_WFXT); - if (hwcap2 & HWCAP2_SME) - setCPUFeature(FEAT_SME); - if (hwcap2 & HWCAP2_SME_I16I64) - setCPUFeature(FEAT_SME_I64); - if (hwcap2 & HWCAP2_SME_F64F64) - setCPUFeature(FEAT_SME_F64); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - getCPUFeature(ID_AA64PFR1_EL1, ftr); - // ID_AA64PFR1_EL1.MTE >= 0b0001 - if (extractBits(ftr, 8, 4) >= 0x1) - setCPUFeature(FEAT_MEMTAG); - // ID_AA64PFR1_EL1.SSBS == 0b0001 - if (extractBits(ftr, 4, 4) == 0x1) - setCPUFeature(FEAT_SSBS); - // ID_AA64PFR1_EL1.SME == 0b0010 - if (extractBits(ftr, 24, 4) == 0x2) - setCPUFeature(FEAT_SME2); - getCPUFeature(ID_AA64PFR0_EL1, ftr); - // ID_AA64PFR0_EL1.FP != 0b1111 - if (extractBits(ftr, 16, 4) != 0xF) { - setCPUFeature(FEAT_FP); - // ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP - setCPUFeature(FEAT_SIMD); - } - // ID_AA64PFR0_EL1.SVE != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) { - // get ID_AA64ZFR0_EL1, that name supported - // if sve enabled only - getCPUFeature(S3_0_C0_C4_4, ftr); - // ID_AA64ZFR0_EL1.SVEver == 0b0000 - if (extractBits(ftr, 0, 4) == 0x0) - setCPUFeature(FEAT_SVE); - // ID_AA64ZFR0_EL1.SVEver == 0b0001 - if (extractBits(ftr, 0, 4) == 0x1) - setCPUFeature(FEAT_SVE2); - // ID_AA64ZFR0_EL1.BF16 != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); - } - getCPUFeature(ID_AA64ISAR0_EL1, ftr); - // ID_AA64ISAR0_EL1.SHA3 != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) - setCPUFeature(FEAT_SHA3); - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - // ID_AA64ISAR1_EL1.DPB >= 0b0001 - if (extractBits(ftr, 0, 4) >= 0x1) - setCPUFeature(FEAT_DPB); - // ID_AA64ISAR1_EL1.LRCPC != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_RCPC); - // ID_AA64ISAR1_EL1.SPECRES == 0b0001 - if (extractBits(ftr, 40, 4) == 0x2) - setCPUFeature(FEAT_PREDRES); - // ID_AA64ISAR1_EL1.BF16 != 0b0000 - if (extractBits(ftr, 44, 4) != 0x0) - setCPUFeature(FEAT_BF16); - // ID_AA64ISAR1_EL1.LS64 >= 0b0001 - if (extractBits(ftr, 60, 4) >= 0x1) - setCPUFeature(FEAT_LS64); - // ID_AA64ISAR1_EL1.LS64 >= 0b0010 - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - // ID_AA64ISAR1_EL1.LS64 >= 0b0011 - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } else { - // Set some features in case of no CPUID support - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { - setCPUFeature(FEAT_FP); - // FP and AdvSIMD fields have the same value - setCPUFeature(FEAT_SIMD); - } - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB); - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC); - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) - setCPUFeature(FEAT_SVE2); - if (hwcap & HWCAP_SHA3) - setCPUFeature(FEAT_SHA3); - } -} - -void CONSTRUCTOR_ATTRIBUTE 
init_cpu_features(void) { - unsigned long hwcap; - unsigned long hwcap2; - // CPU features already initialized. - if (__aarch64_cpu_features.features) - return; - setCPUFeature(FEAT_MAX); -#if defined(__FreeBSD__) - int res = 0; - res = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); - res |= elf_aux_info(AT_HWCAP2, &hwcap2, sizeof hwcap2); - if (res) - return; -#else -#if defined(__ANDROID__) - // Don't set any CPU features, - // detection could be wrong on Exynos 9810. - IF_EXYNOS9810 return; -#endif // defined(__ANDROID__) - hwcap = getauxval(AT_HWCAP); - hwcap2 = getauxval(AT_HWCAP2); -#endif // defined(__FreeBSD__) - init_cpu_features_resolver(hwcap, hwcap2); -#undef extractBits -#undef getCPUFeature -#undef setCPUFeature -#undef IF_EXYNOS9810 -} -#endif // !defined(DISABLE_AARCH64_FMV) -#endif // defined(__has_include) -#endif // __has_include() -#endif // __has_include() -#endif // defined(__aarch64__) diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc new file mode 100644 index 0000000..e78bb88 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -0,0 +1,91 @@ +//===- AArch64CPUFeatures.inc - AArch64 CPU Features enum -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CPUFeatures enum for AArch64 to facilitate better +// testing of this code between LLVM and compiler-rt, primarily that the files +// are an exact match. +// +// This file has two identical copies. The primary copy lives in LLVM and +// the other one sits in compiler-rt/lib/builtins/cpu_model directory. To make +// changes in this file, first modify the primary copy and copy it over to +// compiler-rt. compiler-rt tests will fail if the two files are not synced up. +// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_CPU_FEATURS_INC_H +#define AARCH64_CPU_FEATURS_INC_H + +// Function Multi Versioning CPU features. 
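// Aside: what consumes the enum that follows (illustrative sketch, assuming a
// compiler with AArch64 function multiversioning support; dot_product is a
// hypothetical name). The compiler-emitted ifunc resolver initializes
// __aarch64_cpu_features and tests these FEAT_* bits to pick a version:
__attribute__((target_version("sve2"))) int dot_product(void) { return 2; }
__attribute__((target_version("default"))) int dot_product(void) { return 1; }
// A call to dot_product() binds to the "sve2" body only when the resolver
// observes the FEAT_SVE2 bit set in __aarch64_cpu_features.features.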
+enum CPUFeatures { + FEAT_RNG, + FEAT_FLAGM, + FEAT_FLAGM2, + FEAT_FP16FML, + FEAT_DOTPROD, + FEAT_SM4, + FEAT_RDM, + FEAT_LSE, + FEAT_FP, + FEAT_SIMD, + FEAT_CRC, + FEAT_SHA1, + FEAT_SHA2, + FEAT_SHA3, + FEAT_AES, + FEAT_PMULL, + FEAT_FP16, + FEAT_DIT, + FEAT_DPB, + FEAT_DPB2, + FEAT_JSCVT, + FEAT_FCMA, + FEAT_RCPC, + FEAT_RCPC2, + FEAT_FRINTTS, + FEAT_DGH, + FEAT_I8MM, + FEAT_BF16, + FEAT_EBF16, + FEAT_RPRES, + FEAT_SVE, + FEAT_SVE_BF16, + FEAT_SVE_EBF16, + FEAT_SVE_I8MM, + FEAT_SVE_F32MM, + FEAT_SVE_F64MM, + FEAT_SVE2, + FEAT_SVE_AES, + FEAT_SVE_PMULL128, + FEAT_SVE_BITPERM, + FEAT_SVE_SHA3, + FEAT_SVE_SM4, + FEAT_SME, + FEAT_MEMTAG, + FEAT_MEMTAG2, + FEAT_MEMTAG3, + FEAT_SB, + FEAT_PREDRES, + FEAT_SSBS, + FEAT_SSBS2, + FEAT_BTI, + FEAT_LS64, + FEAT_LS64_V, + FEAT_LS64_ACCDATA, + FEAT_WFXT, + FEAT_SME_F64, + FEAT_SME_I64, + FEAT_SME2, + FEAT_RCPC3, + FEAT_MOPS, + FEAT_MAX, + FEAT_EXT = 62, // Reserved to indicate presence of additional features field + // in __aarch64_cpu_features + FEAT_INIT // Used as flag of features initialization completion +}; + +#endif diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c new file mode 100644 index 0000000..0dd3977 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -0,0 +1,84 @@ +//===-- cpu_model/aarch64.c - Support for __cpu_model builtin ----*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on LLVM's lib/Support/Host.cpp. +// It implements __aarch64_have_lse_atomics, __aarch64_cpu_features for +// AArch64. +// +//===----------------------------------------------------------------------===// + +#include "aarch64.h" + +#if !defined(__aarch64__) +#error This file is intended only for aarch64-based targets +#endif + +#if __has_include() +#include +#else +typedef struct __ifunc_arg_t { + unsigned long _size; + unsigned long _hwcap; + unsigned long _hwcap2; +} __ifunc_arg_t; +#endif // __has_include() + +// LSE support detection for out-of-line atomics +// using HWCAP and Auxiliary vector +_Bool __aarch64_have_lse_atomics + __attribute__((visibility("hidden"), nocommon)) = false; + +#if defined(__FreeBSD__) +// clang-format off: should not reorder sys/auxv.h alphabetically +#include +// clang-format on +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/freebsd.inc" +#elif defined(__Fuchsia__) +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/fuchsia.inc" +#elif defined(__ANDROID__) +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/android.inc" +#elif defined(__linux__) && __has_include() +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/getauxval.inc" +#else +// When unimplemented, we leave __aarch64_have_lse_atomics initialized to false. 
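// (Usage note: the -moutline-atomics helpers, e.g. __aarch64_cas4_acq,
// branch on this flag at run time, taking the single-instruction LSE path
// when it is true and an LL/SC exclusive-load loop when it is false.)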
+#endif + +#if !defined(DISABLE_AARCH64_FMV) + +// Architecture features used +// in Function Multi Versioning +struct { + unsigned long long features; + // As features grows new fields could be added +} __aarch64_cpu_features __attribute__((visibility("hidden"), nocommon)); + +// The formatter wants to re-order these includes, but doing so is incorrect: +// clang-format off +#if defined(__APPLE__) +#include "aarch64/fmv/apple.inc" +#elif defined(__FreeBSD__) +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/freebsd.inc" +#elif defined(__Fuchsia__) +#include "aarch64/fmv/fuchsia.inc" +#elif defined(__ANDROID__) +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/android.inc" +#elif defined(__linux__) && __has_include() +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/getauxval.inc" +#else +#include "aarch64/fmv/unimplemented.inc" +#endif +// clang-format on + +#endif // !defined(DISABLE_AARCH64_FMV) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.h b/compiler-rt/lib/builtins/cpu_model/aarch64.h new file mode 100644 index 0000000..f6cbf75 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.h @@ -0,0 +1,21 @@ +//===-- cpu_model/aarch64.h --------------------------------------------- -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "cpu_model.h" + +#if !defined(__aarch64__) +#error This file is intended only for aarch64-based targets +#endif + +#if !defined(DISABLE_AARCH64_FMV) + +#include "AArch64CPUFeatures.inc" + +void __init_cpu_features(void); + +#endif // !defined(DISABLE_AARCH64_FMV) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc new file mode 100644 index 0000000..a9e3594 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc @@ -0,0 +1,36 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // ifunc resolvers don't have hwcaps in arguments on Android API lower + // than 30. If so, set feature detection done and keep all CPU features + // unsupported (zeros). To detect this case in runtime we check existence + // of memfd_create function from Standard C library which was introduced in + // Android API 30. + int memfd_create(const char *, unsigned int) __attribute__((weak)); + if (!memfd_create) + return; + + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // Don't set any CPU features, + // detection could be wrong on Exynos 9810. 
+ if (__isExynos9810()) + return; + + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc new file mode 100644 index 0000000..f069490 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -0,0 +1,159 @@ +#include +#if TARGET_OS_OSX || TARGET_OS_IPHONE +#include + +#if __has_include() +#include +#define HAS_CPU_CAPABILITIES_PUBLIC_H 1 + +// FB13964283 - A few of these didn't make it into the public SDK yet. +#ifndef CAP_BIT_FEAT_SME +#define CAP_BIT_FEAT_SME 40 +#endif +#ifndef CAP_BIT_FEAT_SME2 +#define CAP_BIT_FEAT_SME2 41 +#endif +#ifndef CAP_BIT_FEAT_SME_F64F64 +#define CAP_BIT_FEAT_SME_F64F64 42 +#endif +#ifndef CAP_BIT_FEAT_SME_I16I64 +#define CAP_BIT_FEAT_SME_I16I64 43 +#endif + +#endif + +static bool isKnownAndSupported(const char *name) { + int32_t val = 0; + size_t size = sizeof(val); + if (sysctlbyname(name, &val, &size, NULL, 0)) + return false; + return val; +} + +static uint64_t deriveImplicitFeatures(uint64_t features) { + // FEAT_SSBS2 implies FEAT_SSBS + if ((1ULL << FEAT_SSBS2) & features) + features |= (1ULL << FEAT_SSBS); + + // FEAT_FP is always enabled + features |= (1ULL << FEAT_FP); + + features |= (1ULL << FEAT_INIT); + + return features; +} + +void __init_cpu_features_resolver(void) { + // On Darwin platforms, this may be called concurrently by multiple threads + // because the resolvers that use it are called lazily at runtime (unlike on + // ELF platforms, where IFuncs are resolved serially at load time). This + // function's effect on __aarch64_cpu_features must be idempotent. + + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + uint64_t features = 0; + +#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H + uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0}; + size_t len = sizeof(feats_bitvec); + // When hw.optional.arm.feats is available (macOS 15.0+, iOS 18.0+), use the + // fast path to get all the feature bits, otherwise fall back to the slow + // ~20-something sysctls path. 
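// (For example, on releases older than macOS 15 the sysctl queried below does
// not exist, sysctlbyname() fails, and control falls through to the
// per-feature hw.optional.* queries further down.)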
+ if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) { + +#define CHECK_BIT(FROM, TO) \ + do { \ + if (feats_bitvec[FROM / 8] & (1u << ((FROM) & 7))) { \ + features |= (1ULL << TO); \ + } \ + } while (0) + + CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM); + CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2); + CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML); + CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD); + CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3); + CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM); + CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); + CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); + CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1); + CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES); + CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); + CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); + CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); + CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); + CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); + CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2); + CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA); + CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT); + CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB); + CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2); + CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16); + CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM); + CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT); + CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16); + CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2); + CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI); + CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD); + CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC); + CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME); + CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2); + CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64); + CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64); + + features = deriveImplicitFeatures(features); + + __atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); + return; + } +#endif + + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + static const struct { + const char *sysctl_name; + enum CPUFeatures feature; + } feature_checks[] = { + {"hw.optional.arm.FEAT_FlagM", FEAT_FLAGM}, + {"hw.optional.arm.FEAT_FlagM2", FEAT_FLAGM2}, + {"hw.optional.arm.FEAT_FHM", FEAT_FP16FML}, + {"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD}, + {"hw.optional.arm.FEAT_RDM", FEAT_RDM}, + {"hw.optional.arm.FEAT_LSE", FEAT_LSE}, + {"hw.optional.AdvSIMD", FEAT_SIMD}, + {"hw.optional.armv8_crc32", FEAT_CRC}, + {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, + {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, + {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, + {"hw.optional.arm.FEAT_AES", FEAT_AES}, + {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, + {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, + {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, + {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, + {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, + {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, + {"hw.optional.arm.FEAT_FCMA", FEAT_FCMA}, + {"hw.optional.arm.FEAT_LRCPC", FEAT_RCPC}, + {"hw.optional.arm.FEAT_LRCPC2", FEAT_RCPC2}, + {"hw.optional.arm.FEAT_FRINTTS", FEAT_FRINTTS}, + {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, + {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, + {"hw.optional.arm.FEAT_SB", FEAT_SB}, + {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, + {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, + {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, + }; + + for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); + I != E; ++I) + if (isKnownAndSupported(feature_checks[I].sysctl_name)) + features |= (1ULL << feature_checks[I].feature); + + features = deriveImplicitFeatures(features); + + 
__atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); +} + +#endif // TARGET_OS_OSX || TARGET_OS_IPHONE diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc new file mode 100644 index 0000000..aa975dc --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc @@ -0,0 +1,27 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + unsigned long hwcap = 0; + unsigned long hwcap2 = 0; + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + int res = 0; + res = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); + res |= elf_aux_info(AT_HWCAP2, &hwcap2, sizeof hwcap2); + if (res) + return; + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc new file mode 100644 index 0000000..1ae4780 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -0,0 +1,53 @@ +#include +#include + +void __init_cpu_features_resolver() { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // This ensures the vDSO is a direct link-time dependency of anything that + // needs this initializer code. +#pragma comment(lib, "zircon") + uint32_t features; + zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) + return; + + unsigned long long feat = 0; +#define setCPUFeature(cpu_feature) feat |= 1ULL << cpu_feature + + if (features & ZX_ARM64_FEATURE_ISA_FP) + setCPUFeature(FEAT_FP); + if (features & ZX_ARM64_FEATURE_ISA_ASIMD) + setCPUFeature(FEAT_SIMD); + if (features & ZX_ARM64_FEATURE_ISA_AES) + setCPUFeature(FEAT_AES); + if (features & ZX_ARM64_FEATURE_ISA_PMULL) + setCPUFeature(FEAT_PMULL); + if (features & ZX_ARM64_FEATURE_ISA_SHA1) + setCPUFeature(FEAT_SHA1); + if (features & ZX_ARM64_FEATURE_ISA_SHA256) + setCPUFeature(FEAT_SHA2); + if (features & ZX_ARM64_FEATURE_ISA_CRC32) + setCPUFeature(FEAT_CRC); + if (features & ZX_ARM64_FEATURE_ISA_RDM) + setCPUFeature(FEAT_RDM); + if (features & ZX_ARM64_FEATURE_ISA_SHA3) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_SM4) + setCPUFeature(FEAT_SM4); + if (features & ZX_ARM64_FEATURE_ISA_DP) + setCPUFeature(FEAT_DOTPROD); + if (features & ZX_ARM64_FEATURE_ISA_FHM) + setCPUFeature(FEAT_FP16FML); + if (features & ZX_ARM64_FEATURE_ISA_SHA512) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_I8MM) + setCPUFeature(FEAT_I8MM); + if (features & ZX_ARM64_FEATURE_ISA_SVE) + setCPUFeature(FEAT_SVE); + + setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc new file mode 100644 index 0000000..486f77a --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc @@ -0,0 +1,21 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if 
(__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc new file mode 100644 index 0000000..e4d5e7f --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -0,0 +1,149 @@ +#if __has_include() +#include +#define HAVE_SYS_AUXV_H +#endif + +static void __init_cpu_features_constructor(unsigned long hwcap, + const __ifunc_arg_t *arg) { + unsigned long long feat = 0; +#define setCPUFeature(F) feat |= 1ULL << F +#define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) +#define extractBits(val, start, number) \ + (val & ((1ULL << number) - 1ULL) << start) >> start + unsigned long hwcap2 = 0; + if (hwcap & _IFUNC_ARG_HWCAP) + hwcap2 = arg->_hwcap2; + if (hwcap & HWCAP_CRC32) + setCPUFeature(FEAT_CRC); + if (hwcap & HWCAP_PMULL) + setCPUFeature(FEAT_PMULL); + if (hwcap & HWCAP_FLAGM) + setCPUFeature(FEAT_FLAGM); + if (hwcap2 & HWCAP2_FLAGM2) + setCPUFeature(FEAT_FLAGM2); + if (hwcap & HWCAP_SM4) + setCPUFeature(FEAT_SM4); + if (hwcap & HWCAP_ASIMDDP) + setCPUFeature(FEAT_DOTPROD); + if (hwcap & HWCAP_ASIMDFHM) + setCPUFeature(FEAT_FP16FML); + if (hwcap & HWCAP_FPHP) + setCPUFeature(FEAT_FP16); + if (hwcap & HWCAP_DIT) + setCPUFeature(FEAT_DIT); + if (hwcap & HWCAP_ASIMDRDM) + setCPUFeature(FEAT_RDM); + if (hwcap & HWCAP_AES) + setCPUFeature(FEAT_AES); + if (hwcap & HWCAP_SHA1) + setCPUFeature(FEAT_SHA1); + if (hwcap & HWCAP_SHA2) + setCPUFeature(FEAT_SHA2); + if (hwcap & HWCAP_JSCVT) + setCPUFeature(FEAT_JSCVT); + if (hwcap & HWCAP_FCMA) + setCPUFeature(FEAT_FCMA); + if (hwcap & HWCAP_SB) + setCPUFeature(FEAT_SB); + if (hwcap & HWCAP_SSBS) { + setCPUFeature(FEAT_SSBS); + setCPUFeature(FEAT_SSBS2); + } + if (hwcap2 & HWCAP2_MTE) { + setCPUFeature(FEAT_MEMTAG); + setCPUFeature(FEAT_MEMTAG2); + } + if (hwcap2 & HWCAP2_MTE3) + setCPUFeature(FEAT_MEMTAG3); + if (hwcap2 & HWCAP2_SVEAES) + setCPUFeature(FEAT_SVE_AES); + if (hwcap2 & HWCAP2_SVEPMULL) + setCPUFeature(FEAT_SVE_PMULL128); + if (hwcap2 & HWCAP2_SVEBITPERM) + setCPUFeature(FEAT_SVE_BITPERM); + if (hwcap2 & HWCAP2_SVESHA3) + setCPUFeature(FEAT_SVE_SHA3); + if (hwcap2 & HWCAP2_SVESM4) + setCPUFeature(FEAT_SVE_SM4); + if (hwcap2 & HWCAP2_DCPODP) + setCPUFeature(FEAT_DPB2); + if (hwcap & HWCAP_ATOMICS) + setCPUFeature(FEAT_LSE); + if (hwcap2 & HWCAP2_RNG) + setCPUFeature(FEAT_RNG); + if (hwcap2 & HWCAP2_I8MM) + setCPUFeature(FEAT_I8MM); + if (hwcap2 & HWCAP2_EBF16) + setCPUFeature(FEAT_EBF16); + if (hwcap2 & HWCAP2_SVE_EBF16) + setCPUFeature(FEAT_SVE_EBF16); + if (hwcap2 & HWCAP2_DGH) + setCPUFeature(FEAT_DGH); + if (hwcap2 & HWCAP2_FRINT) + setCPUFeature(FEAT_FRINTTS); + if (hwcap2 & HWCAP2_SVEI8MM) + setCPUFeature(FEAT_SVE_I8MM); + if (hwcap2 & HWCAP2_SVEF32MM) + setCPUFeature(FEAT_SVE_F32MM); + if (hwcap2 & HWCAP2_SVEF64MM) + setCPUFeature(FEAT_SVE_F64MM); + if (hwcap2 & HWCAP2_BTI) + setCPUFeature(FEAT_BTI); + if (hwcap2 & HWCAP2_RPRES) + setCPUFeature(FEAT_RPRES); + if 
(hwcap2 & HWCAP2_WFXT) + setCPUFeature(FEAT_WFXT); + if (hwcap2 & HWCAP2_SME) + setCPUFeature(FEAT_SME); + if (hwcap2 & HWCAP2_SME2) + setCPUFeature(FEAT_SME2); + if (hwcap2 & HWCAP2_SME_I16I64) + setCPUFeature(FEAT_SME_I64); + if (hwcap2 & HWCAP2_SME_F64F64) + setCPUFeature(FEAT_SME_F64); + if (hwcap2 & HWCAP2_MOPS) + setCPUFeature(FEAT_MOPS); + if (hwcap & HWCAP_CPUID) { + unsigned long ftr; + + getCPUFeature(ID_AA64ISAR1_EL1, ftr); + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ + if (extractBits(ftr, 40, 4) >= 0x1) + setCPUFeature(FEAT_PREDRES); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ + if (extractBits(ftr, 60, 4) >= 0x1) + setCPUFeature(FEAT_LS64); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ + if (extractBits(ftr, 60, 4) >= 0x2) + setCPUFeature(FEAT_LS64_V); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ + if (extractBits(ftr, 60, 4) >= 0x3) + setCPUFeature(FEAT_LS64_ACCDATA); + } + if (hwcap & HWCAP_FP) { + setCPUFeature(FEAT_FP); + // FP and AdvSIMD fields have the same value + setCPUFeature(FEAT_SIMD); + } + if (hwcap & HWCAP_DCPOP) + setCPUFeature(FEAT_DPB); + if (hwcap & HWCAP_LRCPC) + setCPUFeature(FEAT_RCPC); + if (hwcap & HWCAP_ILRCPC) + setCPUFeature(FEAT_RCPC2); + if (hwcap2 & HWCAP2_LRCPC3) + setCPUFeature(FEAT_RCPC3); + if (hwcap2 & HWCAP2_BF16) + setCPUFeature(FEAT_BF16); + if (hwcap2 & HWCAP2_SVEBF16) + setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_SVE) + setCPUFeature(FEAT_SVE); + if (hwcap2 & HWCAP2_SVE2) + setCPUFeature(FEAT_SVE2); + if (hwcap & HWCAP_SHA3) + setCPUFeature(FEAT_SHA3); + setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc new file mode 100644 index 0000000..dc34624 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc @@ -0,0 +1,8 @@ +// On platforms that have not implemented this yet, we provide an implementation +// that does not claim support for any features by leaving +// __aarch64_cpu_features.features initialized to 0. 
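// (Net effect: FEAT_INIT is never set, resolvers observe zero feature bits,
// and every multiversioned function binds to its "default" version.)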
+ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) {} + +void __init_cpu_features(void) {} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc new file mode 100644 index 0000000..41aba82 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc @@ -0,0 +1,189 @@ +#if __has_include() +#include +#define HAVE_SYS_HWCAP_H +#endif + +#ifndef _IFUNC_ARG_HWCAP +#define _IFUNC_ARG_HWCAP (1ULL << 62) +#endif +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif +#ifndef HWCAP_FP +#define HWCAP_FP (1 << 0) +#endif +#ifndef HWCAP_ASIMD +#define HWCAP_ASIMD (1 << 1) +#endif +#ifndef HWCAP_AES +#define HWCAP_AES (1 << 3) +#endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1 << 4) +#endif +#ifndef HWCAP_SHA1 +#define HWCAP_SHA1 (1 << 5) +#endif +#ifndef HWCAP_SHA2 +#define HWCAP_SHA2 (1 << 6) +#endif +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif +#ifndef HWCAP_ATOMICS +#define HWCAP_ATOMICS (1 << 8) +#endif +#ifndef HWCAP_FPHP +#define HWCAP_FPHP (1 << 9) +#endif +#ifndef HWCAP_ASIMDHP +#define HWCAP_ASIMDHP (1 << 10) +#endif +#ifndef HWCAP_ASIMDRDM +#define HWCAP_ASIMDRDM (1 << 12) +#endif +#ifndef HWCAP_JSCVT +#define HWCAP_JSCVT (1 << 13) +#endif +#ifndef HWCAP_FCMA +#define HWCAP_FCMA (1 << 14) +#endif +#ifndef HWCAP_LRCPC +#define HWCAP_LRCPC (1 << 15) +#endif +#ifndef HWCAP_DCPOP +#define HWCAP_DCPOP (1 << 16) +#endif +#ifndef HWCAP_SHA3 +#define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SM3 +#define HWCAP_SM3 (1 << 18) +#endif +#ifndef HWCAP_SM4 +#define HWCAP_SM4 (1 << 19) +#endif +#ifndef HWCAP_ASIMDDP +#define HWCAP_ASIMDDP (1 << 20) +#endif +#ifndef HWCAP_SHA512 +#define HWCAP_SHA512 (1 << 21) +#endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif +#ifndef HWCAP_ASIMDFHM +#define HWCAP_ASIMDFHM (1 << 23) +#endif +#ifndef HWCAP_DIT +#define HWCAP_DIT (1 << 24) +#endif +#ifndef HWCAP_ILRCPC +#define HWCAP_ILRCPC (1 << 26) +#endif +#ifndef HWCAP_FLAGM +#define HWCAP_FLAGM (1 << 27) +#endif +#ifndef HWCAP_SSBS +#define HWCAP_SSBS (1 << 28) +#endif +#ifndef HWCAP_SB +#define HWCAP_SB (1 << 29) +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif +#ifndef HWCAP2_DCPODP +#define HWCAP2_DCPODP (1 << 0) +#endif +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif +#ifndef HWCAP2_SVEAES +#define HWCAP2_SVEAES (1 << 2) +#endif +#ifndef HWCAP2_SVEPMULL +#define HWCAP2_SVEPMULL (1 << 3) +#endif +#ifndef HWCAP2_SVEBITPERM +#define HWCAP2_SVEBITPERM (1 << 4) +#endif +#ifndef HWCAP2_SVESHA3 +#define HWCAP2_SVESHA3 (1 << 5) +#endif +#ifndef HWCAP2_SVESM4 +#define HWCAP2_SVESM4 (1 << 6) +#endif +#ifndef HWCAP2_FLAGM2 +#define HWCAP2_FLAGM2 (1 << 7) +#endif +#ifndef HWCAP2_FRINT +#define HWCAP2_FRINT (1 << 8) +#endif +#ifndef HWCAP2_SVEI8MM +#define HWCAP2_SVEI8MM (1 << 9) +#endif +#ifndef HWCAP2_SVEF32MM +#define HWCAP2_SVEF32MM (1 << 10) +#endif +#ifndef HWCAP2_SVEF64MM +#define HWCAP2_SVEF64MM (1 << 11) +#endif +#ifndef HWCAP2_SVEBF16 +#define HWCAP2_SVEBF16 (1 << 12) +#endif +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif +#ifndef HWCAP2_BF16 +#define HWCAP2_BF16 (1 << 14) +#endif +#ifndef HWCAP2_DGH +#define HWCAP2_DGH (1 << 15) +#endif +#ifndef HWCAP2_RNG +#define HWCAP2_RNG (1 << 16) +#endif +#ifndef HWCAP2_BTI +#define HWCAP2_BTI (1 << 17) +#endif +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif +#ifndef HWCAP2_RPRES +#define HWCAP2_RPRES (1 << 21) +#endif +#ifndef HWCAP2_MTE3 
+#define HWCAP2_MTE3 (1 << 22)
+#endif
+#ifndef HWCAP2_SME
+#define HWCAP2_SME (1 << 23)
+#endif
+#ifndef HWCAP2_SME_I16I64
+#define HWCAP2_SME_I16I64 (1 << 24)
+#endif
+#ifndef HWCAP2_SME_F64F64
+#define HWCAP2_SME_F64F64 (1 << 25)
+#endif
+#ifndef HWCAP2_WFXT
+#define HWCAP2_WFXT (1UL << 31)
+#endif
+#ifndef HWCAP2_EBF16
+#define HWCAP2_EBF16 (1ULL << 32)
+#endif
+#ifndef HWCAP2_SVE_EBF16
+#define HWCAP2_SVE_EBF16 (1ULL << 33)
+#endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_MOPS
+#define HWCAP2_MOPS (1ULL << 43)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3 (1UL << 46)
+#endif
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
new file mode 100644
index 0000000..94bf64a
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
@@ -0,0 +1,28 @@
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/system_properties.h>
+
+static bool __isExynos9810(void) {
+  char arch[PROP_VALUE_MAX];
+  return __system_property_get("ro.arch", arch) > 0 &&
+         strncmp(arch, "exynos9810", sizeof("exynos9810") - 1) == 0;
+}
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  _Bool result = (hwcap & HWCAP_ATOMICS) != 0;
+  if (result) {
+    // Some cores in the Exynos 9810 CPU are ARMv8.2 and others are ARMv8.0;
+    // only the former support LSE atomics. However, the kernel in the
+    // initial Android 8.0 release of Galaxy S9/S9+ devices incorrectly
+    // reported the feature as being supported.
+    //
+    // The kernel appears to have been corrected to mark it unsupported as of
+    // the Android 9.0 release on those devices, and this issue has not been
+    // observed anywhere else. Thus, this workaround may be removed if
+    // compiler-rt ever drops support for Android 8.0.
+    if (__isExynos9810())
+      result = false;
+  }
+  __aarch64_have_lse_atomics = result;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc
new file mode 100644
index 0000000..4a1f9c2
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc
@@ -0,0 +1,5 @@
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap;
+  int result = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
+  __aarch64_have_lse_atomics = result == 0 && (hwcap & HWCAP_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc
new file mode 100644
index 0000000..91eac70
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc
@@ -0,0 +1,12 @@
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  // This ensures the vDSO is a direct link-time dependency of anything that
+  // needs this initializer code.
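// (The pragma below emits a linker directive equivalent to passing -lzircon,
// so _zx_system_get_features from the Zircon vDSO is resolvable wherever
// this constructor is linked in.)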
+#pragma comment(lib, "zircon")
+  uint32_t features;
+  zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+  __aarch64_have_lse_atomics =
+      status == ZX_OK && (features & ZX_ARM64_FEATURE_ISA_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
new file mode 100644
index 0000000..6642c1f
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
@@ -0,0 +1,6 @@
+#include <sys/auxv.h>
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  __aarch64_have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/cpu_model.h b/compiler-rt/lib/builtins/cpu_model/cpu_model.h
new file mode 100644
index 0000000..924ca89
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/cpu_model.h
@@ -0,0 +1,41 @@
+//===-- cpu_model/cpu_model.h - Utilities for cpu model detection -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common utilities for runtime cpu model detection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef COMPILER_RT_LIB_BUILTINS_CPU_MODEL_COMMON_H
+#define COMPILER_RT_LIB_BUILTINS_CPU_MODEL_COMMON_H
+
+#define bool int
+#define true 1
+#define false 0
+
+#ifndef __has_attribute
+#define __has_attribute(attr) 0
+#endif
+
+#if __has_attribute(constructor)
+#if __GNUC__ >= 9
+// Ordinarily init priorities below 101 are disallowed as they are reserved for
+// the implementation. However, we are the implementation, so silence the
+// diagnostic, since it doesn't apply to us.
+#pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
+#endif
+// We're choosing init priority 90 to force our constructors to run before any
+// constructors in the end user application (starting at priority 101). This
+// value matches the libgcc choice for the same functions.
+#define CONSTRUCTOR_ATTRIBUTE __attribute__((constructor(90)))
+#else
+// FIXME: For MSVC, we should make a function pointer global in .CRT$X?? so that
+// this runs during initialization.
+#define CONSTRUCTOR_ATTRIBUTE
+#endif
+
+#endif
diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
new file mode 100644
index 0000000..987812c
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -0,0 +1,370 @@
+//=== cpu_model/riscv.c - Update RISC-V Feature Bits Structure -*- C -*-======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "cpu_model.h" + +#define RISCV_FEATURE_BITS_LENGTH 2 +struct { + unsigned length; + unsigned long long features[RISCV_FEATURE_BITS_LENGTH]; +} __riscv_feature_bits __attribute__((visibility("hidden"), nocommon)); + +#define RISCV_VENDOR_FEATURE_BITS_LENGTH 1 +struct { + unsigned length; + unsigned long long features[RISCV_VENDOR_FEATURE_BITS_LENGTH]; +} __riscv_vendor_feature_bits __attribute__((visibility("hidden"), nocommon)); + +struct { + unsigned mVendorID; + unsigned mArchID; + unsigned mImplID; +} __riscv_cpu_model __attribute__((visibility("hidden"), nocommon)); + +// NOTE: Should sync-up with RISCVFeatures.td +// TODO: Maybe generate a header from tablegen then include it. +#define A_GROUPID 0 +#define A_BITMASK (1ULL << 0) +#define C_GROUPID 0 +#define C_BITMASK (1ULL << 2) +#define D_GROUPID 0 +#define D_BITMASK (1ULL << 3) +#define F_GROUPID 0 +#define F_BITMASK (1ULL << 5) +#define I_GROUPID 0 +#define I_BITMASK (1ULL << 8) +#define M_GROUPID 0 +#define M_BITMASK (1ULL << 12) +#define V_GROUPID 0 +#define V_BITMASK (1ULL << 21) +#define ZACAS_GROUPID 0 +#define ZACAS_BITMASK (1ULL << 26) +#define ZBA_GROUPID 0 +#define ZBA_BITMASK (1ULL << 27) +#define ZBB_GROUPID 0 +#define ZBB_BITMASK (1ULL << 28) +#define ZBC_GROUPID 0 +#define ZBC_BITMASK (1ULL << 29) +#define ZBKB_GROUPID 0 +#define ZBKB_BITMASK (1ULL << 30) +#define ZBKC_GROUPID 0 +#define ZBKC_BITMASK (1ULL << 31) +#define ZBKX_GROUPID 0 +#define ZBKX_BITMASK (1ULL << 32) +#define ZBS_GROUPID 0 +#define ZBS_BITMASK (1ULL << 33) +#define ZFA_GROUPID 0 +#define ZFA_BITMASK (1ULL << 34) +#define ZFH_GROUPID 0 +#define ZFH_BITMASK (1ULL << 35) +#define ZFHMIN_GROUPID 0 +#define ZFHMIN_BITMASK (1ULL << 36) +#define ZICBOZ_GROUPID 0 +#define ZICBOZ_BITMASK (1ULL << 37) +#define ZICOND_GROUPID 0 +#define ZICOND_BITMASK (1ULL << 38) +#define ZIHINTNTL_GROUPID 0 +#define ZIHINTNTL_BITMASK (1ULL << 39) +#define ZIHINTPAUSE_GROUPID 0 +#define ZIHINTPAUSE_BITMASK (1ULL << 40) +#define ZKND_GROUPID 0 +#define ZKND_BITMASK (1ULL << 41) +#define ZKNE_GROUPID 0 +#define ZKNE_BITMASK (1ULL << 42) +#define ZKNH_GROUPID 0 +#define ZKNH_BITMASK (1ULL << 43) +#define ZKSED_GROUPID 0 +#define ZKSED_BITMASK (1ULL << 44) +#define ZKSH_GROUPID 0 +#define ZKSH_BITMASK (1ULL << 45) +#define ZKT_GROUPID 0 +#define ZKT_BITMASK (1ULL << 46) +#define ZTSO_GROUPID 0 +#define ZTSO_BITMASK (1ULL << 47) +#define ZVBB_GROUPID 0 +#define ZVBB_BITMASK (1ULL << 48) +#define ZVBC_GROUPID 0 +#define ZVBC_BITMASK (1ULL << 49) +#define ZVFH_GROUPID 0 +#define ZVFH_BITMASK (1ULL << 50) +#define ZVFHMIN_GROUPID 0 +#define ZVFHMIN_BITMASK (1ULL << 51) +#define ZVKB_GROUPID 0 +#define ZVKB_BITMASK (1ULL << 52) +#define ZVKG_GROUPID 0 +#define ZVKG_BITMASK (1ULL << 53) +#define ZVKNED_GROUPID 0 +#define ZVKNED_BITMASK (1ULL << 54) +#define ZVKNHA_GROUPID 0 +#define ZVKNHA_BITMASK (1ULL << 55) +#define ZVKNHB_GROUPID 0 +#define ZVKNHB_BITMASK (1ULL << 56) +#define ZVKSED_GROUPID 0 +#define ZVKSED_BITMASK (1ULL << 57) +#define ZVKSH_GROUPID 0 +#define ZVKSH_BITMASK (1ULL << 58) +#define ZVKT_GROUPID 0 +#define ZVKT_BITMASK (1ULL << 59) +#define ZVE32X_GROUPID 0 +#define ZVE32X_BITMASK (1ULL << 60) +#define ZVE32F_GROUPID 0 +#define ZVE32F_BITMASK (1ULL << 61) +#define ZVE64X_GROUPID 0 +#define ZVE64X_BITMASK (1ULL << 62) +#define ZVE64F_GROUPID 0 +#define ZVE64F_BITMASK (1ULL << 63) +#define ZVE64D_GROUPID 1 
+#define ZVE64D_BITMASK (1ULL << 0)
+#define ZIMOP_GROUPID 1
+#define ZIMOP_BITMASK (1ULL << 1)
+#define ZCA_GROUPID 1
+#define ZCA_BITMASK (1ULL << 2)
+#define ZCB_GROUPID 1
+#define ZCB_BITMASK (1ULL << 3)
+#define ZCD_GROUPID 1
+#define ZCD_BITMASK (1ULL << 4)
+#define ZCF_GROUPID 1
+#define ZCF_BITMASK (1ULL << 5)
+#define ZCMOP_GROUPID 1
+#define ZCMOP_BITMASK (1ULL << 6)
+#define ZAWRS_GROUPID 1
+#define ZAWRS_BITMASK (1ULL << 7)
+
+#if defined(__linux__)
+
+// The RISC-V hwprobe interface is documented here:
+// <https://docs.kernel.org/arch/riscv/hwprobe.html>.
+
+static long syscall_impl_5_args(long number, long arg1, long arg2, long arg3,
+                                long arg4, long arg5) {
+  register long a7 __asm__("a7") = number;
+  register long a0 __asm__("a0") = arg1;
+  register long a1 __asm__("a1") = arg2;
+  register long a2 __asm__("a2") = arg3;
+  register long a3 __asm__("a3") = arg4;
+  register long a4 __asm__("a4") = arg5;
+  __asm__ __volatile__("ecall\n\t"
+                       : "=r"(a0)
+                       : "r"(a7), "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4)
+                       : "memory");
+  return a0;
+}
+
+#define RISCV_HWPROBE_KEY_MVENDORID 0
+#define RISCV_HWPROBE_KEY_MARCHID 1
+#define RISCV_HWPROBE_KEY_MIMPID 2
+#define RISCV_HWPROBE_KEY_BASE_BEHAVIOR 3
+#define RISCV_HWPROBE_BASE_BEHAVIOR_IMA (1ULL << 0)
+#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+#define RISCV_HWPROBE_IMA_FD (1ULL << 0)
+#define RISCV_HWPROBE_IMA_C (1ULL << 1)
+#define RISCV_HWPROBE_IMA_V (1ULL << 2)
+#define RISCV_HWPROBE_EXT_ZBA (1ULL << 3)
+#define RISCV_HWPROBE_EXT_ZBB (1ULL << 4)
+#define RISCV_HWPROBE_EXT_ZBS (1ULL << 5)
+#define RISCV_HWPROBE_EXT_ZICBOZ (1ULL << 6)
+#define RISCV_HWPROBE_EXT_ZBC (1ULL << 7)
+#define RISCV_HWPROBE_EXT_ZBKB (1ULL << 8)
+#define RISCV_HWPROBE_EXT_ZBKC (1ULL << 9)
+#define RISCV_HWPROBE_EXT_ZBKX (1ULL << 10)
+#define RISCV_HWPROBE_EXT_ZKND (1ULL << 11)
+#define RISCV_HWPROBE_EXT_ZKNE (1ULL << 12)
+#define RISCV_HWPROBE_EXT_ZKNH (1ULL << 13)
+#define RISCV_HWPROBE_EXT_ZKSED (1ULL << 14)
+#define RISCV_HWPROBE_EXT_ZKSH (1ULL << 15)
+#define RISCV_HWPROBE_EXT_ZKT (1ULL << 16)
+#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17)
+#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18)
+#define RISCV_HWPROBE_EXT_ZVKB (1ULL << 19)
+#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20)
+#define RISCV_HWPROBE_EXT_ZVKNED (1ULL << 21)
+#define RISCV_HWPROBE_EXT_ZVKNHA (1ULL << 22)
+#define RISCV_HWPROBE_EXT_ZVKNHB (1ULL << 23)
+#define RISCV_HWPROBE_EXT_ZVKSED (1ULL << 24)
+#define RISCV_HWPROBE_EXT_ZVKSH (1ULL << 25)
+#define RISCV_HWPROBE_EXT_ZVKT (1ULL << 26)
+#define RISCV_HWPROBE_EXT_ZFH (1ULL << 27)
+#define RISCV_HWPROBE_EXT_ZFHMIN (1ULL << 28)
+#define RISCV_HWPROBE_EXT_ZIHINTNTL (1ULL << 29)
+#define RISCV_HWPROBE_EXT_ZVFH (1ULL << 30)
+#define RISCV_HWPROBE_EXT_ZVFHMIN (1ULL << 31)
+#define RISCV_HWPROBE_EXT_ZFA (1ULL << 32)
+#define RISCV_HWPROBE_EXT_ZTSO (1ULL << 33)
+#define RISCV_HWPROBE_EXT_ZACAS (1ULL << 34)
+#define RISCV_HWPROBE_EXT_ZICOND (1ULL << 35)
+#define RISCV_HWPROBE_EXT_ZIHINTPAUSE (1ULL << 36)
+#define RISCV_HWPROBE_EXT_ZVE32X (1ULL << 37)
+#define RISCV_HWPROBE_EXT_ZVE32F (1ULL << 38)
+#define RISCV_HWPROBE_EXT_ZVE64X (1ULL << 39)
+#define RISCV_HWPROBE_EXT_ZVE64F (1ULL << 40)
+#define RISCV_HWPROBE_EXT_ZVE64D (1ULL << 41)
+#define RISCV_HWPROBE_EXT_ZIMOP (1ULL << 42)
+#define RISCV_HWPROBE_EXT_ZCA (1ULL << 43)
+#define RISCV_HWPROBE_EXT_ZCB (1ULL << 44)
+#define RISCV_HWPROBE_EXT_ZCD (1ULL << 45)
+#define RISCV_HWPROBE_EXT_ZCF (1ULL << 46)
+#define RISCV_HWPROBE_EXT_ZCMOP (1ULL << 47)
+#define RISCV_HWPROBE_EXT_ZAWRS (1ULL << 48)
+#define RISCV_HWPROBE_KEY_CPUPERF_0 5
+#define
RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) +#define RISCV_HWPROBE_MISALIGNED_EMULATED (1ULL << 0) +#define RISCV_HWPROBE_MISALIGNED_SLOW (2 << 0) +#define RISCV_HWPROBE_MISALIGNED_FAST (3 << 0) +#define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) +#define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) +#define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 +/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ + +struct riscv_hwprobe { + long long key; + unsigned long long value; +}; + +#define __NR_riscv_hwprobe 258 +static long initHwProbe(struct riscv_hwprobe *Hwprobes, int len) { + return syscall_impl_5_args(__NR_riscv_hwprobe, (long)Hwprobes, len, 0, 0, 0); +} + +#define SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(EXTNAME) \ + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_EXT_##EXTNAME, EXTNAME) + +#define SET_SINGLE_IMAEXT_RISCV_FEATURE(HWPROBE_BITMASK, EXT) \ + SET_SINGLE_RISCV_FEATURE(IMAEXT0Value &HWPROBE_BITMASK, EXT) + +#define SET_SINGLE_RISCV_FEATURE(COND, EXT) \ + if (COND) { \ + SET_RISCV_FEATURE(EXT); \ + } + +#define SET_RISCV_FEATURE(EXT) features[EXT##_GROUPID] |= EXT##_BITMASK + +static void initRISCVFeature(struct riscv_hwprobe Hwprobes[]) { + + // Note: If a hwprobe key is unknown to the kernel, its key field + // will be cleared to -1, and its value set to 0. + // This unsets all extension bitmask bits. + + // Init VendorID, ArchID, ImplID + __riscv_cpu_model.mVendorID = Hwprobes[2].value; + __riscv_cpu_model.mArchID = Hwprobes[3].value; + __riscv_cpu_model.mImplID = Hwprobes[4].value; + + // Init standard extension + // TODO: Maybe Extension implied generate from tablegen? + + unsigned long long features[RISCV_FEATURE_BITS_LENGTH]; + int i; + + for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++) + features[i] = 0; + + // Check RISCV_HWPROBE_KEY_BASE_BEHAVIOR + unsigned long long BaseValue = Hwprobes[0].value; + if (BaseValue & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) { + SET_RISCV_FEATURE(I); + SET_RISCV_FEATURE(M); + SET_RISCV_FEATURE(A); + } + + // Check RISCV_HWPROBE_KEY_IMA_EXT_0 + unsigned long long IMAEXT0Value = Hwprobes[1].value; + if (IMAEXT0Value & RISCV_HWPROBE_IMA_FD) { + SET_RISCV_FEATURE(F); + SET_RISCV_FEATURE(D); + } + + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_C, C); + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_V, V); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBS); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICBOZ); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKX); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKND); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNE); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKT); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKG); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKT); + 
SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFHMIN); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTNTL); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTPAUSE); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFHMIN); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZTSO); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZACAS); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICOND); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE32X); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE32F); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64X); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64F); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64D); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIMOP); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCD); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCF); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCMOP); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZAWRS); + + for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++) + __riscv_feature_bits.features[i] = features[i]; +} + +#endif // defined(__linux__) + +static int FeaturesBitCached = 0; + +void __init_riscv_feature_bits(void *) CONSTRUCTOR_ATTRIBUTE; + +// A constructor function that sets __riscv_feature_bits, and +// __riscv_vendor_feature_bits to the right values. This needs to run +// only once. This constructor is given the highest priority and it should +// run before constructors without the priority set. However, it still runs +// after ifunc initializers and needs to be called explicitly there. + +// PlatformArgs allows the platform to provide pre-computed data and access it +// without extra effort. For example, Linux could pass the vDSO object to avoid +// an extra system call. +void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits(void *PlatformArgs) { + + if (FeaturesBitCached) + return; + + __riscv_feature_bits.length = RISCV_FEATURE_BITS_LENGTH; + __riscv_vendor_feature_bits.length = RISCV_VENDOR_FEATURE_BITS_LENGTH; + +#if defined(__linux__) + struct riscv_hwprobe Hwprobes[] = { + {RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0}, {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}, + {RISCV_HWPROBE_KEY_MVENDORID, 0}, {RISCV_HWPROBE_KEY_MARCHID, 0}, + {RISCV_HWPROBE_KEY_MIMPID, 0}, + }; + if (initHwProbe(Hwprobes, sizeof(Hwprobes) / sizeof(Hwprobes[0]))) + return; + + initRISCVFeature(Hwprobes); +#endif // defined(__linux__) + + FeaturesBitCached = 1; +} diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c new file mode 100644 index 0000000..069defc --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -0,0 +1,1139 @@ +//===-- cpu_model/x86.c - Support for __cpu_model builtin --------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on LLVM's lib/Support/Host.cpp. +// It implements the operating system Host concept and builtin +// __cpu_model for the compiler_rt library for x86. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "cpu_model.h"
+
+#if !(defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||         \
+      defined(_M_X64))
+#error This file is intended only for x86-based targets
+#endif
+
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
+
+#include <assert.h>
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
+#include <cpuid.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+enum VendorSignatures {
+  SIG_INTEL = 0x756e6547, // Genu
+  SIG_AMD = 0x68747541,   // Auth
+};
+
+enum ProcessorVendors {
+  VENDOR_INTEL = 1,
+  VENDOR_AMD,
+  VENDOR_OTHER,
+  VENDOR_MAX
+};
+
+enum ProcessorTypes {
+  INTEL_BONNELL = 1,
+  INTEL_CORE2,
+  INTEL_COREI7,
+  AMDFAM10H,
+  AMDFAM15H,
+  INTEL_SILVERMONT,
+  INTEL_KNL,
+  AMD_BTVER1,
+  AMD_BTVER2,
+  AMDFAM17H,
+  INTEL_KNM,
+  INTEL_GOLDMONT,
+  INTEL_GOLDMONT_PLUS,
+  INTEL_TREMONT,
+  AMDFAM19H,
+  ZHAOXIN_FAM7H,
+  INTEL_SIERRAFOREST,
+  INTEL_GRANDRIDGE,
+  INTEL_CLEARWATERFOREST,
+  CPU_TYPE_MAX
+};
+
+enum ProcessorSubtypes {
+  INTEL_COREI7_NEHALEM = 1,
+  INTEL_COREI7_WESTMERE,
+  INTEL_COREI7_SANDYBRIDGE,
+  AMDFAM10H_BARCELONA,
+  AMDFAM10H_SHANGHAI,
+  AMDFAM10H_ISTANBUL,
+  AMDFAM15H_BDVER1,
+  AMDFAM15H_BDVER2,
+  AMDFAM15H_BDVER3,
+  AMDFAM15H_BDVER4,
+  AMDFAM17H_ZNVER1,
+  INTEL_COREI7_IVYBRIDGE,
+  INTEL_COREI7_HASWELL,
+  INTEL_COREI7_BROADWELL,
+  INTEL_COREI7_SKYLAKE,
+  INTEL_COREI7_SKYLAKE_AVX512,
+  INTEL_COREI7_CANNONLAKE,
+  INTEL_COREI7_ICELAKE_CLIENT,
+  INTEL_COREI7_ICELAKE_SERVER,
+  AMDFAM17H_ZNVER2,
+  INTEL_COREI7_CASCADELAKE,
+  INTEL_COREI7_TIGERLAKE,
+  INTEL_COREI7_COOPERLAKE,
+  INTEL_COREI7_SAPPHIRERAPIDS,
+  INTEL_COREI7_ALDERLAKE,
+  AMDFAM19H_ZNVER3,
+  INTEL_COREI7_ROCKETLAKE,
+  ZHAOXIN_FAM7H_LUJIAZUI,
+  AMDFAM19H_ZNVER4,
+  INTEL_COREI7_GRANITERAPIDS,
+  INTEL_COREI7_GRANITERAPIDS_D,
+  INTEL_COREI7_ARROWLAKE,
+  INTEL_COREI7_ARROWLAKE_S,
+  INTEL_COREI7_PANTHERLAKE,
+  CPU_SUBTYPE_MAX
+};
+
+enum ProcessorFeatures {
+  FEATURE_CMOV = 0,
+  FEATURE_MMX,
+  FEATURE_POPCNT,
+  FEATURE_SSE,
+  FEATURE_SSE2,
+  FEATURE_SSE3,
+  FEATURE_SSSE3,
+  FEATURE_SSE4_1,
+  FEATURE_SSE4_2,
+  FEATURE_AVX,
+  FEATURE_AVX2,
+  FEATURE_SSE4_A,
+  FEATURE_FMA4,
+  FEATURE_XOP,
+  FEATURE_FMA,
+  FEATURE_AVX512F,
+  FEATURE_BMI,
+  FEATURE_BMI2,
+  FEATURE_AES,
+  FEATURE_PCLMUL,
+  FEATURE_AVX512VL,
+  FEATURE_AVX512BW,
+  FEATURE_AVX512DQ,
+  FEATURE_AVX512CD,
+  FEATURE_AVX512ER,
+  FEATURE_AVX512PF,
+  FEATURE_AVX512VBMI,
+  FEATURE_AVX512IFMA,
+  FEATURE_AVX5124VNNIW,
+  FEATURE_AVX5124FMAPS,
+  FEATURE_AVX512VPOPCNTDQ,
+  FEATURE_AVX512VBMI2,
+  FEATURE_GFNI,
+  FEATURE_VPCLMULQDQ,
+  FEATURE_AVX512VNNI,
+  FEATURE_AVX512BITALG,
+  FEATURE_AVX512BF16,
+  FEATURE_AVX512VP2INTERSECT,
+  // FIXME: The features below are incomplete compared to gcc, because some
+  // of gcc's features do not map one-to-one onto llvm's.
+  // FEATURE_3DNOW,
+  // FEATURE_3DNOWP,
+  FEATURE_ADX = 40,
+  // FEATURE_ABM,
+  FEATURE_CLDEMOTE = 42,
+  FEATURE_CLFLUSHOPT,
+  FEATURE_CLWB,
+  FEATURE_CLZERO,
+  FEATURE_CMPXCHG16B,
+  // FIXME: Not adding FEATURE_CMPXCHG8B is a workaround to make 'generic' a
+  // cpu string with no X86_FEATURE_COMPAT features, which is required in the
+  // current implementation of the cpu_specific/cpu_dispatch FMV feature.
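+  // (The explicit "= N" initializers below keep each remaining feature at
+  // the same bit position as in gcc's list even across the commented-out
+  // gaps; e.g. FEATURE_ENQCMD stays pinned to bit 48 and FEATURE_WIDEKL to
+  // bit 92.)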
+ // FEATURE_CMPXCHG8B, + FEATURE_ENQCMD = 48, + FEATURE_F16C, + FEATURE_FSGSBASE, + // FEATURE_FXSAVE, + // FEATURE_HLE, + // FEATURE_IBT, + FEATURE_LAHF_LM = 54, + FEATURE_LM, + FEATURE_LWP, + FEATURE_LZCNT, + FEATURE_MOVBE, + FEATURE_MOVDIR64B, + FEATURE_MOVDIRI, + FEATURE_MWAITX, + // FEATURE_OSXSAVE, + FEATURE_PCONFIG = 63, + FEATURE_PKU, + FEATURE_PREFETCHWT1, + FEATURE_PRFCHW, + FEATURE_PTWRITE, + FEATURE_RDPID, + FEATURE_RDRND, + FEATURE_RDSEED, + FEATURE_RTM, + FEATURE_SERIALIZE, + FEATURE_SGX, + FEATURE_SHA, + FEATURE_SHSTK, + FEATURE_TBM, + FEATURE_TSXLDTRK, + FEATURE_VAES, + FEATURE_WAITPKG, + FEATURE_WBNOINVD, + FEATURE_XSAVE, + FEATURE_XSAVEC, + FEATURE_XSAVEOPT, + FEATURE_XSAVES, + FEATURE_AMX_TILE, + FEATURE_AMX_INT8, + FEATURE_AMX_BF16, + FEATURE_UINTR, + FEATURE_HRESET, + FEATURE_KL, + // FEATURE_AESKLE, + FEATURE_WIDEKL = 92, + FEATURE_AVXVNNI, + FEATURE_AVX512FP16, + FEATURE_X86_64_BASELINE, + FEATURE_X86_64_V2, + FEATURE_X86_64_V3, + FEATURE_X86_64_V4, + FEATURE_AVXIFMA, + FEATURE_AVXVNNIINT8, + FEATURE_AVXNECONVERT, + FEATURE_CMPCCXADD, + FEATURE_AMX_FP16, + FEATURE_PREFETCHI, + FEATURE_RAOINT, + FEATURE_AMX_COMPLEX, + FEATURE_AVXVNNIINT16, + FEATURE_SM3, + FEATURE_SHA512, + FEATURE_SM4, + FEATURE_APXF, + FEATURE_USERMSR, + FEATURE_AVX10_1_256, + FEATURE_AVX10_1_512, + FEATURE_AVX10_2_256, + FEATURE_AVX10_2_512, + CPU_FEATURE_MAX +}; + +// This code is copied from lib/Support/Host.cpp. +// Changes to either file should be mirrored in the other. + +/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in +/// the specified arguments. If we can't run cpuid on the host, return true. +static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, + unsigned *rECX, unsigned *rEDX) { +#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) + return !__get_cpuid(value, rEAX, rEBX, rECX, rEDX); +#elif defined(_MSC_VER) + // The MSVC intrinsic is portable across x86 and x64. + int registers[4]; + __cpuid(registers, value); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; +#else + return true; +#endif +} + +/// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return +/// the 4 values in the specified arguments. If we can't run cpuid on the host, +/// return true. +static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, + unsigned *rEAX, unsigned *rEBX, unsigned *rECX, + unsigned *rEDX) { + // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang + // are such that __cpuidex is defined within cpuid.h for both, we can remove + // the __get_cpuid_count function and share the MSVC implementation between + // all three. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) + return !__get_cpuid_count(value, subleaf, rEAX, rEBX, rECX, rEDX); +#elif defined(_MSC_VER) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; +#else + return true; +#endif +} + +// Read control register 0 (XCR0). Used to detect features such as AVX. +static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) { + // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang + // are such that _xgetbv is supported by both, we can unify the implementation + // with MSVC and remove all inline assembly. 
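+  // XCR0's low bits report which register state the OS saves on context
+  // switch: bit 1 (SSE/XMM) and bit 2 (AVX/YMM) form the 0x6 mask that
+  // callers test, and bits 5-7 (0xe0) cover the AVX-512 opmask/ZMM state.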
+#if defined(__GNUC__) || defined(__clang__) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(*rEAX), "=d"(*rEDX) : "c"(0)); + return false; +#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + *rEAX = Result; + *rEDX = Result >> 32; + return false; +#else + return true; +#endif +} + +static void detectX86FamilyModel(unsigned EAX, unsigned *Family, + unsigned *Model) { + *Family = (EAX >> 8) & 0xf; // Bits 8 - 11 + *Model = (EAX >> 4) & 0xf; // Bits 4 - 7 + if (*Family == 6 || *Family == 0xf) { + if (*Family == 0xf) + // Examine extended family ID if family ID is F. + *Family += (EAX >> 20) & 0xff; // Bits 20 - 27 + // Examine extended model ID if family ID is 6 or F. + *Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 + } +} + +#define testFeature(F) (Features[F / 32] & (1 << (F % 32))) != 0 + +static const char *getIntelProcessorTypeAndSubtype(unsigned Family, + unsigned Model, + const unsigned *Features, + unsigned *Type, + unsigned *Subtype) { + // We select CPU strings to match the code in Host.cpp, but we don't use them + // in compiler-rt. + const char *CPU = 0; + + switch (Family) { + case 6: + switch (Model) { + case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile + // processor, Intel Core 2 Quad processor, Intel Core 2 Quad + // mobile processor, Intel Core 2 Extreme processor, Intel + // Pentium Dual-Core processor, Intel Xeon processor, model + // 0Fh. All processors are manufactured using the 65 nm process. + case 0x16: // Intel Celeron processor model 16h. All processors are + // manufactured using the 65 nm process + CPU = "core2"; + *Type = INTEL_CORE2; + break; + case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model + // 17h. All processors are manufactured using the 45 nm process. + // + // 45nm: Penryn , Wolfdale, Yorkfield (XE) + case 0x1d: // Intel Xeon processor MP. All processors are manufactured using + // the 45 nm process. + CPU = "penryn"; + *Type = INTEL_CORE2; + break; + case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 45 nm process. + case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. + // As found in a Summer 2010 model iMac. + case 0x1f: + case 0x2e: // Nehalem EX + CPU = "nehalem"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_NEHALEM; + break; + case 0x25: // Intel Core i7, laptop version. + case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 32 nm process. + case 0x2f: // Westmere EX + CPU = "westmere"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_WESTMERE; + break; + case 0x2a: // Intel Core i7 processor. All processors are manufactured + // using the 32 nm process. 
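+    // Sandy Bridge-E/EP server parts: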
+ case 0x2d: + CPU = "sandybridge"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SANDYBRIDGE; + break; + case 0x3a: + case 0x3e: // Ivy Bridge EP + CPU = "ivybridge"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_IVYBRIDGE; + break; + + // Haswell: + case 0x3c: + case 0x3f: + case 0x45: + case 0x46: + CPU = "haswell"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_HASWELL; + break; + + // Broadwell: + case 0x3d: + case 0x47: + case 0x4f: + case 0x56: + CPU = "broadwell"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_BROADWELL; + break; + + // Skylake: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop + case 0xa5: // Comet Lake-H/S + case 0xa6: // Comet Lake-U + CPU = "skylake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SKYLAKE; + break; + + // Rocketlake: + case 0xa7: + CPU = "rocketlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ROCKETLAKE; + break; + + // Skylake Xeon: + case 0x55: + *Type = INTEL_COREI7; + if (testFeature(FEATURE_AVX512BF16)) { + CPU = "cooperlake"; + *Subtype = INTEL_COREI7_COOPERLAKE; + } else if (testFeature(FEATURE_AVX512VNNI)) { + CPU = "cascadelake"; + *Subtype = INTEL_COREI7_CASCADELAKE; + } else { + CPU = "skylake-avx512"; + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; + } + break; + + // Cannonlake: + case 0x66: + CPU = "cannonlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_CANNONLAKE; + break; + + // Icelake: + case 0x7d: + case 0x7e: + CPU = "icelake-client"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ICELAKE_CLIENT; + break; + + // Tigerlake: + case 0x8c: + case 0x8d: + CPU = "tigerlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_TIGERLAKE; + break; + + // Alderlake: + case 0x97: + case 0x9a: + // Raptorlake: + case 0xb7: + case 0xba: + case 0xbf: + // Meteorlake: + case 0xaa: + case 0xac: + // Gracemont: + case 0xbe: + CPU = "alderlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ALDERLAKE; + break; + + // Arrowlake: + case 0xc5: + CPU = "arrowlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ARROWLAKE; + break; + + // Arrowlake S: + case 0xc6: + // Lunarlake: + case 0xbd: + CPU = "arrowlake-s"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ARROWLAKE_S; + break; + + // Pantherlake: + case 0xcc: + CPU = "pantherlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_PANTHERLAKE; + break; + + // Icelake Xeon: + case 0x6a: + case 0x6c: + CPU = "icelake-server"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ICELAKE_SERVER; + break; + + // Emerald Rapids: + case 0xcf: + // Sapphire Rapids: + case 0x8f: + CPU = "sapphirerapids"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SAPPHIRERAPIDS; + break; + + // Granite Rapids: + case 0xad: + CPU = "graniterapids"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_GRANITERAPIDS; + break; + + // Granite Rapids D: + case 0xae: + CPU = "graniterapids-d"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_GRANITERAPIDS_D; + break; + + case 0x1c: // Most 45 nm Intel Atom processors + case 0x26: // 45 nm Atom Lincroft + case 0x27: // 32 nm Atom Medfield + case 0x35: // 32 nm Atom Midview + case 0x36: // 32 nm Atom Midview + CPU = "bonnell"; + *Type = INTEL_BONNELL; + break; + + // Atom Silvermont codes from the Intel software optimization guide. 
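+    // (Per public Intel documentation, model 0x37 is Bay Trail and 0x4d is
+    // Avoton.)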
+ case 0x37: + case 0x4a: + case 0x4d: + case 0x5a: + case 0x5d: + case 0x4c: // really airmont + CPU = "silvermont"; + *Type = INTEL_SILVERMONT; + break; + // Goldmont: + case 0x5c: // Apollo Lake + case 0x5f: // Denverton + CPU = "goldmont"; + *Type = INTEL_GOLDMONT; + break; // "goldmont" + case 0x7a: + CPU = "goldmont-plus"; + *Type = INTEL_GOLDMONT_PLUS; + break; + case 0x86: + case 0x8a: // Lakefield + case 0x96: // Elkhart Lake + case 0x9c: // Jasper Lake + CPU = "tremont"; + *Type = INTEL_TREMONT; + break; + + // Sierraforest: + case 0xaf: + CPU = "sierraforest"; + *Type = INTEL_SIERRAFOREST; + break; + + // Grandridge: + case 0xb6: + CPU = "grandridge"; + *Type = INTEL_GRANDRIDGE; + break; + + // Clearwaterforest: + case 0xdd: + CPU = "clearwaterforest"; + *Type = INTEL_COREI7; + *Subtype = INTEL_CLEARWATERFOREST; + break; + + case 0x57: + CPU = "knl"; + *Type = INTEL_KNL; + break; + + case 0x85: + CPU = "knm"; + *Type = INTEL_KNM; + break; + + default: // Unknown family 6 CPU. + break; + } + break; + default: + break; // Unknown. + } + + return CPU; +} + +static const char *getAMDProcessorTypeAndSubtype(unsigned Family, + unsigned Model, + const unsigned *Features, + unsigned *Type, + unsigned *Subtype) { + const char *CPU = 0; + + switch (Family) { + case 4: + CPU = "i486"; + break; + case 5: + CPU = "pentium"; + switch (Model) { + case 6: + case 7: + CPU = "k6"; + break; + case 8: + CPU = "k6-2"; + break; + case 9: + case 13: + CPU = "k6-3"; + break; + case 10: + CPU = "geode"; + break; + } + break; + case 6: + if (testFeature(FEATURE_SSE)) { + CPU = "athlon-xp"; + break; + } + CPU = "athlon"; + break; + case 15: + if (testFeature(FEATURE_SSE3)) { + CPU = "k8-sse3"; + break; + } + CPU = "k8"; + break; + case 16: + CPU = "amdfam10"; + *Type = AMDFAM10H; // "amdfam10" + switch (Model) { + case 2: + *Subtype = AMDFAM10H_BARCELONA; + break; + case 4: + *Subtype = AMDFAM10H_SHANGHAI; + break; + case 8: + *Subtype = AMDFAM10H_ISTANBUL; + break; + } + break; + case 20: + CPU = "btver1"; + *Type = AMD_BTVER1; + break; + case 21: + CPU = "bdver1"; + *Type = AMDFAM15H; + if (Model >= 0x60 && Model <= 0x7f) { + CPU = "bdver4"; + *Subtype = AMDFAM15H_BDVER4; + break; // 60h-7Fh: Excavator + } + if (Model >= 0x30 && Model <= 0x3f) { + CPU = "bdver3"; + *Subtype = AMDFAM15H_BDVER3; + break; // 30h-3Fh: Steamroller + } + if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) { + CPU = "bdver2"; + *Subtype = AMDFAM15H_BDVER2; + break; // 02h, 10h-1Fh: Piledriver + } + if (Model <= 0x0f) { + *Subtype = AMDFAM15H_BDVER1; + break; // 00h-0Fh: Bulldozer + } + break; + case 22: + CPU = "btver2"; + *Type = AMD_BTVER2; + break; + case 23: + CPU = "znver1"; + *Type = AMDFAM17H; + if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) || + (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) || + (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 17h Models 30h-3Fh (Starship) Zen 2 + // Family 17h Models 47h (Cardinal) Zen 2 + // Family 17h Models 60h-67h (Renoir) Zen 2 + // Family 17h Models 68h-6Fh (Lucienne) Zen 2 + // Family 17h Models 70h-7Fh (Matisse) Zen 2 + // Family 17h Models 84h-87h (ProjectX) Zen 2 + // Family 17h Models 90h-97h (VanGogh) Zen 2 + // Family 17h Models 98h-9Fh (Mero) Zen 2 + // Family 17h Models A0h-AFh (Mendocino) Zen 2 + CPU = "znver2"; + *Subtype = AMDFAM17H_ZNVER2; + break; + } + if ((Model >= 0x10 && Model <= 
0x1f) || (Model >= 0x20 && Model <= 0x2f)) { + // Family 17h Models 10h-1Fh (Raven1) Zen + // Family 17h Models 10h-1Fh (Picasso) Zen+ + // Family 17h Models 20h-2Fh (Raven2 x86) Zen + *Subtype = AMDFAM17H_ZNVER1; + break; + } + break; + case 25: + CPU = "znver3"; + *Type = AMDFAM19H; + if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x2f) || + (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) || + (Model >= 0x50 && Model <= 0x5f)) { + // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3 + // Family 19h Models 20h-2Fh (Vermeer) Zen 3 + // Family 19h Models 30h-3Fh (Badami) Zen 3 + // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+ + // Family 19h Models 50h-5Fh (Cezanne) Zen 3 + *Subtype = AMDFAM19H_ZNVER3; + break; + } + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4 + // Family 19h Models 60h-6Fh (Raphael) Zen 4 + // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4 + // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4 + // Family 19h Models A0h-AFh (Stones-Dense) Zen 4 + CPU = "znver4"; + *Subtype = AMDFAM19H_ZNVER4; + break; // "znver4" + } + break; // family 19h + default: + break; // Unknown AMD CPU. + } + + return CPU; +} + +#undef testFeature + +static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, + unsigned *Features) { + unsigned EAX = 0, EBX = 0; + +#define hasFeature(F) ((Features[F / 32] >> (F % 32)) & 1) +#define setFeature(F) Features[F / 32] |= 1U << (F % 32) + + if ((EDX >> 15) & 1) + setFeature(FEATURE_CMOV); + if ((EDX >> 23) & 1) + setFeature(FEATURE_MMX); + if ((EDX >> 25) & 1) + setFeature(FEATURE_SSE); + if ((EDX >> 26) & 1) + setFeature(FEATURE_SSE2); + + if ((ECX >> 0) & 1) + setFeature(FEATURE_SSE3); + if ((ECX >> 1) & 1) + setFeature(FEATURE_PCLMUL); + if ((ECX >> 9) & 1) + setFeature(FEATURE_SSSE3); + if ((ECX >> 12) & 1) + setFeature(FEATURE_FMA); + if ((ECX >> 13) & 1) + setFeature(FEATURE_CMPXCHG16B); + if ((ECX >> 19) & 1) + setFeature(FEATURE_SSE4_1); + if ((ECX >> 20) & 1) + setFeature(FEATURE_SSE4_2); + if ((ECX >> 22) & 1) + setFeature(FEATURE_MOVBE); + if ((ECX >> 23) & 1) + setFeature(FEATURE_POPCNT); + if ((ECX >> 25) & 1) + setFeature(FEATURE_AES); + if ((ECX >> 29) & 1) + setFeature(FEATURE_F16C); + if ((ECX >> 30) & 1) + setFeature(FEATURE_RDRND); + + // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV + // indicates that the AVX registers will be saved and restored on context + // switch, then we have full AVX support. + const unsigned AVXBits = (1 << 27) | (1 << 28); + bool HasAVXSave = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && + ((EAX & 0x6) == 0x6); +#if defined(__APPLE__) + // Darwin lazily saves the AVX512 context on first use: trust that the OS will + // save the AVX512 context if we use AVX512 instructions, even the bit is not + // set right now. + bool HasAVX512Save = true; +#else + // AVX512 requires additional context to be saved by the OS. + bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0); +#endif + // AMX requires additional context to be saved by the OS. 
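+  // XCR0 bit 17 (XTILECFG) and bit 18 (XTILEDATA) must both be enabled
+  // before AMX tile state is saved/restored, hence the combined mask below.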
+ const unsigned AMXBits = (1 << 17) | (1 << 18); + bool HasXSave = ((ECX >> 27) & 1) && !getX86XCR0(&EAX, &EDX); + bool HasAMXSave = HasXSave && ((EAX & AMXBits) == AMXBits); + + if (HasAVXSave) + setFeature(FEATURE_AVX); + + if (((ECX >> 26) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVE); + + bool HasLeaf7 = + MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); + + if (HasLeaf7 && ((EBX >> 0) & 1)) + setFeature(FEATURE_FSGSBASE); + if (HasLeaf7 && ((EBX >> 2) & 1)) + setFeature(FEATURE_SGX); + if (HasLeaf7 && ((EBX >> 3) & 1)) + setFeature(FEATURE_BMI); + if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVXSave) + setFeature(FEATURE_AVX2); + if (HasLeaf7 && ((EBX >> 8) & 1)) + setFeature(FEATURE_BMI2); + if (HasLeaf7 && ((EBX >> 11) & 1)) + setFeature(FEATURE_RTM); + if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512F); + if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512DQ); + if (HasLeaf7 && ((EBX >> 18) & 1)) + setFeature(FEATURE_RDSEED); + if (HasLeaf7 && ((EBX >> 19) & 1)) + setFeature(FEATURE_ADX); + if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512IFMA); + if (HasLeaf7 && ((EBX >> 24) & 1)) + setFeature(FEATURE_CLWB); + if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512PF); + if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512ER); + if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512CD); + if (HasLeaf7 && ((EBX >> 29) & 1)) + setFeature(FEATURE_SHA); + if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BW); + if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VL); + + if (HasLeaf7 && ((ECX >> 0) & 1)) + setFeature(FEATURE_PREFETCHWT1); + if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VBMI); + if (HasLeaf7 && ((ECX >> 4) & 1)) + setFeature(FEATURE_PKU); + if (HasLeaf7 && ((ECX >> 5) & 1)) + setFeature(FEATURE_WAITPKG); + if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VBMI2); + if (HasLeaf7 && ((ECX >> 7) & 1)) + setFeature(FEATURE_SHSTK); + if (HasLeaf7 && ((ECX >> 8) & 1)) + setFeature(FEATURE_GFNI); + if (HasLeaf7 && ((ECX >> 9) & 1) && HasAVXSave) + setFeature(FEATURE_VAES); + if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVXSave) + setFeature(FEATURE_VPCLMULQDQ); + if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VNNI); + if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BITALG); + if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VPOPCNTDQ); + if (HasLeaf7 && ((ECX >> 22) & 1)) + setFeature(FEATURE_RDPID); + if (HasLeaf7 && ((ECX >> 23) & 1)) + setFeature(FEATURE_KL); + if (HasLeaf7 && ((ECX >> 25) & 1)) + setFeature(FEATURE_CLDEMOTE); + if (HasLeaf7 && ((ECX >> 27) & 1)) + setFeature(FEATURE_MOVDIRI); + if (HasLeaf7 && ((ECX >> 28) & 1)) + setFeature(FEATURE_MOVDIR64B); + if (HasLeaf7 && ((ECX >> 29) & 1)) + setFeature(FEATURE_ENQCMD); + + if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX5124VNNIW); + if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX5124FMAPS); + if (HasLeaf7 && ((EDX >> 5) & 1)) + setFeature(FEATURE_UINTR); + if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VP2INTERSECT); + if (HasLeaf7 && ((EDX >> 14) & 1)) + setFeature(FEATURE_SERIALIZE); + if (HasLeaf7 && ((EDX >> 16) & 1)) + 
setFeature(FEATURE_TSXLDTRK); + if (HasLeaf7 && ((EDX >> 18) & 1)) + setFeature(FEATURE_PCONFIG); + if (HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_BF16); + if (HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512FP16); + if (HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_TILE); + if (HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_INT8); + + // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't + // return all 0s for invalid subleaves so check the limit. + bool HasLeaf7Subleaf1 = + HasLeaf7 && EAX >= 1 && + !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7Subleaf1 && ((EAX >> 0) & 1)) + setFeature(FEATURE_SHA512); + if (HasLeaf7Subleaf1 && ((EAX >> 1) & 1)) + setFeature(FEATURE_SM3); + if (HasLeaf7Subleaf1 && ((EAX >> 2) & 1)) + setFeature(FEATURE_SM4); + if (HasLeaf7Subleaf1 && ((EAX >> 3) & 1)) + setFeature(FEATURE_RAOINT); + if (HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNI); + if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BF16); + if (HasLeaf7Subleaf1 && ((EAX >> 7) & 1)) + setFeature(FEATURE_CMPCCXADD); + if (HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_FP16); + if (HasLeaf7Subleaf1 && ((EAX >> 22) & 1)) + setFeature(FEATURE_HRESET); + if (HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave) + setFeature(FEATURE_AVXIFMA); + + if (HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNIINT8); + if (HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave) + setFeature(FEATURE_AVXNECONVERT); + if (HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_COMPLEX); + if (HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNIINT16); + if (HasLeaf7Subleaf1 && ((EDX >> 14) & 1)) + setFeature(FEATURE_PREFETCHI); + if (HasLeaf7Subleaf1 && ((EDX >> 15) & 1)) + setFeature(FEATURE_USERMSR); + if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1)) + setFeature(FEATURE_APXF); + + unsigned MaxLevel = 0; + getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX); + bool HasLeafD = MaxLevel >= 0xd && + !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); + if (HasLeafD && ((EAX >> 0) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVEOPT); + if (HasLeafD && ((EAX >> 1) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVEC); + if (HasLeafD && ((EAX >> 3) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVES); + + bool HasLeaf24 = + MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1) && HasLeaf24) { + bool Has512Len = (EBX >> 18) & 1; + int AVX10Ver = EBX & 0xff; + if (AVX10Ver >= 2) { + setFeature(FEATURE_AVX10_2_256); + if (Has512Len) + setFeature(FEATURE_AVX10_2_512); + } + if (AVX10Ver >= 1) { + setFeature(FEATURE_AVX10_1_256); + if (Has512Len) + setFeature(FEATURE_AVX10_1_512); + } + } + + unsigned MaxExtLevel = 0; + getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); + + bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 && + !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + if (HasExtLeaf1) { + if (ECX & 1) + setFeature(FEATURE_LAHF_LM); + if ((ECX >> 5) & 1) + setFeature(FEATURE_LZCNT); + if (((ECX >> 6) & 1)) + setFeature(FEATURE_SSE4_A); + if (((ECX >> 8) & 1)) + setFeature(FEATURE_PRFCHW); + if (((ECX >> 11) & 1)) + setFeature(FEATURE_XOP); + if (((ECX >> 15) & 1)) + setFeature(FEATURE_LWP); + if (((ECX >> 16) & 1)) + setFeature(FEATURE_FMA4); 
+ if (((ECX >> 21) & 1)) + setFeature(FEATURE_TBM); + if (((ECX >> 29) & 1)) + setFeature(FEATURE_MWAITX); + + if (((EDX >> 29) & 1)) + setFeature(FEATURE_LM); + } + + bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 && + !getX86CpuIDAndInfo(0x80000008, &EAX, &EBX, &ECX, &EDX); + if (HasExtLeaf8 && ((EBX >> 0) & 1)) + setFeature(FEATURE_CLZERO); + if (HasExtLeaf8 && ((EBX >> 9) & 1)) + setFeature(FEATURE_WBNOINVD); + + bool HasLeaf14 = MaxLevel >= 0x14 && + !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf14 && ((EBX >> 4) & 1)) + setFeature(FEATURE_PTWRITE); + + bool HasLeaf19 = + MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1)) + setFeature(FEATURE_WIDEKL); + + if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) { + setFeature(FEATURE_X86_64_BASELINE); + if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) && + hasFeature(FEATURE_LAHF_LM) && hasFeature(FEATURE_SSE4_2)) { + setFeature(FEATURE_X86_64_V2); + if (hasFeature(FEATURE_AVX2) && hasFeature(FEATURE_BMI) && + hasFeature(FEATURE_BMI2) && hasFeature(FEATURE_F16C) && + hasFeature(FEATURE_FMA) && hasFeature(FEATURE_LZCNT) && + hasFeature(FEATURE_MOVBE)) { + setFeature(FEATURE_X86_64_V3); + if (hasFeature(FEATURE_AVX512BW) && hasFeature(FEATURE_AVX512CD) && + hasFeature(FEATURE_AVX512DQ) && hasFeature(FEATURE_AVX512VL)) + setFeature(FEATURE_X86_64_V4); + } + } + } + +#undef hasFeature +#undef setFeature +} + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +int __cpu_indicator_init(void) CONSTRUCTOR_ATTRIBUTE; + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +struct __processor_model { + unsigned int __cpu_vendor; + unsigned int __cpu_type; + unsigned int __cpu_subtype; + unsigned int __cpu_features[1]; +} __cpu_model = {0, 0, 0, {0}}; + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32]; + +// A constructor function that is sets __cpu_model and __cpu_features2 with +// the right values. This needs to run only once. This constructor is +// given the highest priority and it should run before constructors without +// the priority set. However, it still runs after ifunc initializers and +// needs to be called explicitly there. + +int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + unsigned MaxLeaf = 5; + unsigned Vendor; + unsigned Model, Family; + unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; + static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + + // This function needs to run just once. + if (__cpu_model.__cpu_vendor) + return 0; + + if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { + __cpu_model.__cpu_vendor = VENDOR_OTHER; + return -1; + } + + getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX); + detectX86FamilyModel(EAX, &Family, &Model); + + // Find available features. + getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]); + + __cpu_model.__cpu_features[0] = Features[0]; + __cpu_features2[0] = Features[1]; + __cpu_features2[1] = Features[2]; + __cpu_features2[2] = Features[3]; + + if (Vendor == SIG_INTEL) { + // Get CPU type. + getIntelProcessorTypeAndSubtype(Family, Model, &Features[0], + &(__cpu_model.__cpu_type), + &(__cpu_model.__cpu_subtype)); + __cpu_model.__cpu_vendor = VENDOR_INTEL; + } else if (Vendor == SIG_AMD) { + // Get CPU type. 
+    getAMDProcessorTypeAndSubtype(Family, Model, &Features[0],
+                                  &(__cpu_model.__cpu_type),
+                                  &(__cpu_model.__cpu_subtype));
+    __cpu_model.__cpu_vendor = VENDOR_AMD;
+  } else
+    __cpu_model.__cpu_vendor = VENDOR_OTHER;
+
+  assert(__cpu_model.__cpu_vendor < VENDOR_MAX);
+  assert(__cpu_model.__cpu_type < CPU_TYPE_MAX);
+  assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX);
+
+  return 0;
+}
+#endif // defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
diff --git a/compiler-rt/lib/builtins/crtbegin.c b/compiler-rt/lib/builtins/crtbegin.c
new file mode 100644
index 0000000..d5f7756
--- /dev/null
+++ b/compiler-rt/lib/builtins/crtbegin.c
@@ -0,0 +1,171 @@
+//===-- crtbegin.c - Start of constructors and destructors ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+#if __has_feature(ptrauth_init_fini)
+#include <ptrauth.h>
+#endif
+
+__attribute__((visibility("hidden"))) void *__dso_handle = &__dso_handle;
+
+#ifdef EH_USE_FRAME_REGISTRY
+__extension__ static void *__EH_FRAME_LIST__[]
+    __attribute__((section(".eh_frame"), aligned(sizeof(void *)))) = {};
+
+extern void __register_frame_info(const void *, void *) __attribute__((weak));
+extern void *__deregister_frame_info(const void *) __attribute__((weak));
+#endif
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+typedef void (*fp)(void);
+
+static fp __CTOR_LIST__[]
+    __attribute__((section(".ctors"), aligned(sizeof(fp)))) = {(fp)-1};
+extern fp __CTOR_LIST_END__[];
+#endif
+
+extern void __cxa_finalize(void *) __attribute__((weak));
+
+static void __attribute__((used)) __do_init(void) {
+  static _Bool __initialized;
+  if (__builtin_expect(__initialized, 0))
+    return;
+  __initialized = 1;
+
+#ifdef EH_USE_FRAME_REGISTRY
+  static struct { void *p[8]; } __object;
+  if (__register_frame_info)
+    __register_frame_info(__EH_FRAME_LIST__, &__object);
+#endif
+#ifndef CRT_HAS_INITFINI_ARRAY
+  const size_t n = __CTOR_LIST_END__ - __CTOR_LIST__ - 1;
+  for (size_t i = n; i >= 1; i--) __CTOR_LIST__[i]();
+#endif
+}
+
+#ifdef CRT_HAS_INITFINI_ARRAY
+#if __has_feature(ptrauth_init_fini)
+// TODO: use __ptrauth-qualified pointers when they are supported on clang side
+#if __has_feature(ptrauth_init_fini_address_discrimination)
+__attribute__((section(".init_array"), used)) static void *__init =
+    ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
+                          ptrauth_blend_discriminator(
+                              &__init, __ptrauth_init_fini_discriminator));
+#else
+__attribute__((section(".init_array"), used)) static void *__init =
+    ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
+                          __ptrauth_init_fini_discriminator);
+#endif
+#else
+__attribute__((section(".init_array"),
+               used)) static void (*__init)(void) = __do_init;
+#endif
+#elif defined(__i386__) || defined(__x86_64__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#elif defined(__riscv)
+__asm__(".pushsection .init,\"ax\",%progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#elif defined(__arm__) || defined(__aarch64__)
+__asm__(".pushsection .init,\"ax\",%progbits\n\t"
+        "bl __do_init\n\t"
+        ".popsection");
+#elif defined(__mips__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "jal __do_init\n\t"
+        ".popsection");
+#elif
defined(__powerpc__) || defined(__powerpc64__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "bl __do_init\n\t"
+        "nop\n\t"
+        ".popsection");
+#elif defined(__sparc__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#else
+#error "crtbegin without .init_fini array unimplemented for this architecture"
+#endif // CRT_HAS_INITFINI_ARRAY
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+static fp __DTOR_LIST__[]
+    __attribute__((section(".dtors"), aligned(sizeof(fp)))) = {(fp)-1};
+extern fp __DTOR_LIST_END__[];
+#endif
+
+static void __attribute__((used)) __do_fini(void) {
+  static _Bool __finalized;
+  if (__builtin_expect(__finalized, 0))
+    return;
+  __finalized = 1;
+
+  if (__cxa_finalize)
+    __cxa_finalize(__dso_handle);
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+  const size_t n = __DTOR_LIST_END__ - __DTOR_LIST__ - 1;
+  for (size_t i = 1; i <= n; i++) __DTOR_LIST__[i]();
+#endif
+#ifdef EH_USE_FRAME_REGISTRY
+  if (__deregister_frame_info)
+    __deregister_frame_info(__EH_FRAME_LIST__);
+#endif
+}
+
+#ifdef CRT_HAS_INITFINI_ARRAY
+#if __has_feature(ptrauth_init_fini)
+// TODO: use __ptrauth-qualified pointers when they are supported on clang side
+#if __has_feature(ptrauth_init_fini_address_discrimination)
+__attribute__((section(".fini_array"), used)) static void *__fini =
+    ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
+                          ptrauth_blend_discriminator(
+                              &__fini, __ptrauth_init_fini_discriminator));
+#else
+__attribute__((section(".fini_array"), used)) static void *__fini =
+    ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
+                          __ptrauth_init_fini_discriminator);
+#endif
+#else
+__attribute__((section(".fini_array"),
+               used)) static void (*__fini)(void) = __do_fini;
+#endif
+#elif defined(__i386__) || defined(__x86_64__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#elif defined(__arm__) || defined(__aarch64__)
+__asm__(".pushsection .fini,\"ax\",%progbits\n\t"
+        "bl __do_fini\n\t"
+        ".popsection");
+#elif defined(__mips__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "jal __do_fini\n\t"
+        ".popsection");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "bl __do_fini\n\t"
+        "nop\n\t"
+        ".popsection");
+#elif defined(__riscv)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#elif defined(__sparc__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#else
+#error "crtbegin without .init_fini array unimplemented for this architecture"
+#endif // CRT_HAS_INITFINI_ARRAY
diff --git a/compiler-rt/lib/builtins/crtend.c b/compiler-rt/lib/builtins/crtend.c
new file mode 100644
index 0000000..ebcc60b
--- /dev/null
+++ b/compiler-rt/lib/builtins/crtend.c
@@ -0,0 +1,22 @@
+//===-- crtend.c - End of constructors and destructors --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+// Put 4-byte zero which is the length field in FDE at the end as a terminator.
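+// (An FDE length field of zero marks the end of the .eh_frame entry list,
+// terminating what crtbegin's __EH_FRAME_LIST__ begins; the "used" attribute
+// keeps the array from being optimized away.)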
+const int32_t __EH_FRAME_LIST_END__[] + __attribute__((section(".eh_frame"), aligned(sizeof(int32_t)), + visibility("hidden"), used)) = {0}; + +#ifndef CRT_HAS_INITFINI_ARRAY +typedef void (*fp)(void); +fp __CTOR_LIST_END__[] + __attribute__((section(".ctors"), visibility("hidden"), used)) = {0}; +fp __DTOR_LIST_END__[] + __attribute__((section(".dtors"), visibility("hidden"), used)) = {0}; +#endif diff --git a/compiler-rt/lib/builtins/divmoddi4.c b/compiler-rt/lib/builtins/divmoddi4.c index e7cbbb1..64bbb69 100644 --- a/compiler-rt/lib/builtins/divmoddi4.c +++ b/compiler-rt/lib/builtins/divmoddi4.c @@ -18,8 +18,8 @@ COMPILER_RT_ABI di_int __divmoddi4(di_int a, di_int b, di_int *rem) { const int bits_in_dword_m1 = (int)(sizeof(di_int) * CHAR_BIT) - 1; di_int s_a = a >> bits_in_dword_m1; // s_a = a < 0 ? -1 : 0 di_int s_b = b >> bits_in_dword_m1; // s_b = b < 0 ? -1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (du_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (du_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient du_int r; di_int q = (__udivmoddi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divmodsi4.c b/compiler-rt/lib/builtins/divmodsi4.c index a85e299..193f810 100644 --- a/compiler-rt/lib/builtins/divmodsi4.c +++ b/compiler-rt/lib/builtins/divmodsi4.c @@ -19,8 +19,8 @@ COMPILER_RT_ABI si_int __divmodsi4(si_int a, si_int b, si_int *rem) { const int bits_in_word_m1 = (int)(sizeof(si_int) * CHAR_BIT) - 1; si_int s_a = a >> bits_in_word_m1; // s_a = a < 0 ? -1 : 0 si_int s_b = b >> bits_in_word_m1; // s_b = b < 0 ? -1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (su_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (su_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient su_int r; si_int q = (__udivmodsi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divmodti4.c b/compiler-rt/lib/builtins/divmodti4.c index b243ba4..185d3d4 100644 --- a/compiler-rt/lib/builtins/divmodti4.c +++ b/compiler-rt/lib/builtins/divmodti4.c @@ -20,8 +20,8 @@ COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int *rem) { const int bits_in_tword_m1 = (int)(sizeof(ti_int) * CHAR_BIT) - 1; ti_int s_a = a >> bits_in_tword_m1; // s_a = a < 0 ? -1 : 0 ti_int s_b = b >> bits_in_tword_m1; // s_b = b < 0 ? 
-1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (tu_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (tu_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient tu_int r; ti_int q = (__udivmodti4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c index 0e47992..c393de8 100644 --- a/compiler-rt/lib/builtins/divtc3.c +++ b/compiler-rt/lib/builtins/divtc3.c @@ -12,44 +12,45 @@ #define QUAD_PRECISION #include "fp_lib.h" -#include "int_lib.h" -#include "int_math.h" + +#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) // Returns: the quotient of (a + ib) / (c + id) -COMPILER_RT_ABI Lcomplex __divtc3(long double __a, long double __b, - long double __c, long double __d) { +COMPILER_RT_ABI Qcomplex __divtc3(fp_t __a, fp_t __b, fp_t __c, fp_t __d) { int __ilogbw = 0; - long double __logbw = - __compiler_rt_logbl(__compiler_rt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); + fp_t __logbw = __compiler_rt_logbtf( + __compiler_rt_fmaxtf(crt_fabstf(__c), crt_fabstf(__d))); if (crt_isfinite(__logbw)) { __ilogbw = (int)__logbw; - __c = __compiler_rt_scalbnl(__c, -__ilogbw); - __d = __compiler_rt_scalbnl(__d, -__ilogbw); + __c = __compiler_rt_scalbntf(__c, -__ilogbw); + __d = __compiler_rt_scalbntf(__d, -__ilogbw); } - long double __denom = __c * __c + __d * __d; - Lcomplex z; - COMPLEX_REAL(z) = - __compiler_rt_scalbnl((__a * __c + __b * __d) / __denom, -__ilogbw); - COMPLEX_IMAGINARY(z) = - __compiler_rt_scalbnl((__b * __c - __a * __d) / __denom, -__ilogbw); - if (crt_isnan(COMPLEX_REAL(z)) && crt_isnan(COMPLEX_IMAGINARY(z))) { + fp_t __denom = __c * __c + __d * __d; + Qcomplex z; + COMPLEXTF_REAL(z) = + __compiler_rt_scalbntf((__a * __c + __b * __d) / __denom, -__ilogbw); + COMPLEXTF_IMAGINARY(z) = + __compiler_rt_scalbntf((__b * __c - __a * __d) / __denom, -__ilogbw); + if (crt_isnan(COMPLEXTF_REAL(z)) && crt_isnan(COMPLEXTF_IMAGINARY(z))) { if ((__denom == 0.0) && (!crt_isnan(__a) || !crt_isnan(__b))) { - COMPLEX_REAL(z) = crt_copysignl(CRT_INFINITY, __c) * __a; - COMPLEX_IMAGINARY(z) = crt_copysignl(CRT_INFINITY, __c) * __b; + COMPLEXTF_REAL(z) = crt_copysigntf(CRT_INFINITY, __c) * __a; + COMPLEXTF_IMAGINARY(z) = crt_copysigntf(CRT_INFINITY, __c) * __b; } else if ((crt_isinf(__a) || crt_isinf(__b)) && crt_isfinite(__c) && crt_isfinite(__d)) { - __a = crt_copysignl(crt_isinf(__a) ? 1.0 : 0.0, __a); - __b = crt_copysignl(crt_isinf(__b) ? 1.0 : 0.0, __b); - COMPLEX_REAL(z) = CRT_INFINITY * (__a * __c + __b * __d); - COMPLEX_IMAGINARY(z) = CRT_INFINITY * (__b * __c - __a * __d); + __a = crt_copysigntf(crt_isinf(__a) ? (fp_t)1.0 : (fp_t)0.0, __a); + __b = crt_copysigntf(crt_isinf(__b) ? (fp_t)1.0 : (fp_t)0.0, __b); + COMPLEXTF_REAL(z) = CRT_INFINITY * (__a * __c + __b * __d); + COMPLEXTF_IMAGINARY(z) = CRT_INFINITY * (__b * __c - __a * __d); } else if (crt_isinf(__logbw) && __logbw > 0.0 && crt_isfinite(__a) && crt_isfinite(__b)) { - __c = crt_copysignl(crt_isinf(__c) ? 1.0 : 0.0, __c); - __d = crt_copysignl(crt_isinf(__d) ? 1.0 : 0.0, __d); - COMPLEX_REAL(z) = 0.0 * (__a * __c + __b * __d); - COMPLEX_IMAGINARY(z) = 0.0 * (__b * __c - __a * __d); + __c = crt_copysigntf(crt_isinf(__c) ? (fp_t)1.0 : (fp_t)0.0, __c); + __d = crt_copysigntf(crt_isinf(__d) ? 
(fp_t)1.0 : (fp_t)0.0, __d); + COMPLEXTF_REAL(z) = 0.0 * (__a * __c + __b * __d); + COMPLEXTF_IMAGINARY(z) = 0.0 * (__b * __c - __a * __d); } } return z; } + +#endif diff --git a/compiler-rt/lib/builtins/divtf3.c b/compiler-rt/lib/builtins/divtf3.c index 5bcc9a8..bd76763 100644 --- a/compiler-rt/lib/builtins/divtf3.c +++ b/compiler-rt/lib/builtins/divtf3.c @@ -14,7 +14,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define NUMBER_OF_HALF_ITERATIONS 4 #define NUMBER_OF_FULL_ITERATIONS 1 diff --git a/compiler-rt/lib/builtins/divxc3.c b/compiler-rt/lib/builtins/divxc3.c index 97ffd2e..3423334 100644 --- a/compiler-rt/lib/builtins/divxc3.c +++ b/compiler-rt/lib/builtins/divxc3.c @@ -17,16 +17,16 @@ // Returns: the quotient of (a + ib) / (c + id) -COMPILER_RT_ABI Lcomplex __divxc3(long double __a, long double __b, - long double __c, long double __d) { +COMPILER_RT_ABI Lcomplex __divxc3(xf_float __a, xf_float __b, xf_float __c, + xf_float __d) { int __ilogbw = 0; - long double __logbw = crt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); + xf_float __logbw = crt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); if (crt_isfinite(__logbw)) { __ilogbw = (int)__logbw; __c = crt_scalbnl(__c, -__ilogbw); __d = crt_scalbnl(__d, -__ilogbw); } - long double __denom = __c * __c + __d * __d; + xf_float __denom = __c * __c + __d * __d; Lcomplex z; COMPLEX_REAL(z) = crt_scalbnl((__a * __c + __b * __d) / __denom, -__ilogbw); COMPLEX_IMAGINARY(z) = diff --git a/compiler-rt/lib/builtins/extendbfsf2.c b/compiler-rt/lib/builtins/extendbfsf2.c new file mode 100644 index 0000000..e159d79 --- /dev/null +++ b/compiler-rt/lib/builtins/extendbfsf2.c @@ -0,0 +1,13 @@ +//===-- lib/extendbfsf2.c - bfloat -> single conversion -----------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
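// Review note (illustrative sketch, not part of the patch): the __divmoddi4 /
// __divmodsi4 / __divmodti4 hunks above all make the same fix. The idiom
// (a ^ s) - s negates a when the sign mask s is -1, but for the most negative
// value the signed intermediate overflows, which is undefined behavior in C;
// routing the subtraction through the unsigned type makes it well defined.
// A minimal sketch of the 64-bit case:

#include <stdint.h>
static uint64_t branchless_abs64(int64_t a) {
  const int64_t s = a >> 63; // -1 if a < 0, else 0 (arithmetic shift)
  // Unsigned wraparound is defined, so this is valid even for INT64_MIN,
  // whose magnitude (2^63) only the unsigned type can represent -- exactly
  // the case the (du_int)/(su_int)/(tu_int) casts above handle.
  return (uint64_t)(a ^ s) - (uint64_t)s;
}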
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SRC_BFLOAT16 +#define DST_SINGLE +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI float __extendbfsf2(src_t a) { return __extendXfYf2__(a); } diff --git a/compiler-rt/lib/builtins/extenddftf2.c b/compiler-rt/lib/builtins/extenddftf2.c index ddf470e..a61ef53 100644 --- a/compiler-rt/lib/builtins/extenddftf2.c +++ b/compiler-rt/lib/builtins/extenddftf2.c @@ -9,13 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_DOUBLE #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI fp_t __extenddftf2(double a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extenddftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendhftf2.c b/compiler-rt/lib/builtins/extendhftf2.c index aefe973..7609db6 100644 --- a/compiler-rt/lib/builtins/extendhftf2.c +++ b/compiler-rt/lib/builtins/extendhftf2.c @@ -10,14 +10,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) && \ - defined(COMPILER_RT_HAS_FLOAT16) +#if defined(CRT_HAS_TF_MODE) && defined(COMPILER_RT_HAS_FLOAT16) #define SRC_HALF #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI long double __extendhftf2(_Float16 a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extendhftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendsftf2.c b/compiler-rt/lib/builtins/extendsftf2.c index cf1fd2f..4ab2982 100644 --- a/compiler-rt/lib/builtins/extendsftf2.c +++ b/compiler-rt/lib/builtins/extendsftf2.c @@ -9,13 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_SINGLE #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI fp_t __extendsftf2(float a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extendsftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendxftf2.c b/compiler-rt/lib/builtins/extendxftf2.c new file mode 100644 index 0000000..c1d97b5 --- /dev/null +++ b/compiler-rt/lib/builtins/extendxftf2.c @@ -0,0 +1,24 @@ +//===-- lib/extendxftf2.c - long double -> quad conversion --------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Assumption: long double is a IEEE 80 bit floating point type padded to 128 +// bits. 
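// Review note (illustrative sketch, not part of the patch): unlike the
// IEEE-754 interchange formats, the x87 80-bit format stores its integer bit
// explicitly, so the 80 significant bits are 1 sign + 15 exponent + 64
// significand (1 integer bit + 63 fraction bits), with no implicit leading 1.
// That is why fp_extend.h defines srcSigFracBits = 63 and srcExpBits = 15 for
// SRC_80. A hypothetical decomposition of the low 80 bits of such a value:

#include <stdint.h>
typedef unsigned __int128 u128_sketch; // helper names are illustrative only
static void split_x87_sketch(u128_sketch rep, int *sign, int *exp,
                             uint64_t *sig) {
  *sign = (int)((rep >> 79) & 1);     // bit 79: sign
  *exp = (int)((rep >> 64) & 0x7fff); // bits 64..78: 15-bit biased exponent
  *sig = (uint64_t)rep;               // bits 0..63: significand, explicit
                                      // integer bit at bit 63
}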
+ +#define QUAD_PRECISION +#include "fp_lib.h" + +#if defined(CRT_HAS_TF_MODE) && __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) +#define SRC_80 +#define DST_QUAD +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI tf_float __extendxftf2(xf_float a) { + return __extendXfYf2__(a); +} + +#endif diff --git a/compiler-rt/lib/builtins/fixtfdi.c b/compiler-rt/lib/builtins/fixtfdi.c index fe570e6..d27a99b 100644 --- a/compiler-rt/lib/builtins/fixtfdi.c +++ b/compiler-rt/lib/builtins/fixtfdi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef di_int fixint_t; typedef du_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixtfsi.c b/compiler-rt/lib/builtins/fixtfsi.c index a32bd96..01e352a 100644 --- a/compiler-rt/lib/builtins/fixtfsi.c +++ b/compiler-rt/lib/builtins/fixtfsi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef si_int fixint_t; typedef su_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixtfti.c b/compiler-rt/lib/builtins/fixtfti.c index 19f84ce..491fca5 100644 --- a/compiler-rt/lib/builtins/fixtfti.c +++ b/compiler-rt/lib/builtins/fixtfti.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef ti_int fixint_t; typedef tu_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfdi.c b/compiler-rt/lib/builtins/fixunstfdi.c index a0805e6..febdb8f 100644 --- a/compiler-rt/lib/builtins/fixunstfdi.c +++ b/compiler-rt/lib/builtins/fixunstfdi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef du_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfsi.c b/compiler-rt/lib/builtins/fixunstfsi.c index 3a1320e..4efc387 100644 --- a/compiler-rt/lib/builtins/fixunstfsi.c +++ b/compiler-rt/lib/builtins/fixunstfsi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef su_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfti.c b/compiler-rt/lib/builtins/fixunstfti.c index 23cd1ab..fa9e7aa 100644 --- a/compiler-rt/lib/builtins/fixunstfti.c +++ b/compiler-rt/lib/builtins/fixunstfti.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef tu_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunsxfdi.c b/compiler-rt/lib/builtins/fixunsxfdi.c index c8a8061..957c263 100644 --- a/compiler-rt/lib/builtins/fixunsxfdi.c +++ b/compiler-rt/lib/builtins/fixunsxfdi.c @@ -32,8 +32,8 @@ #pragma warning(disable : 4700) #endif -COMPILER_RT_ABI du_int __fixunsxfdi(long double a) { - long_double_bits fb; +COMPILER_RT_ABI du_int __fixunsxfdi(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixunsxfsi.c b/compiler-rt/lib/builtins/fixunsxfsi.c index 154abcb..a0abb82 100644 --- a/compiler-rt/lib/builtins/fixunsxfsi.c +++ b/compiler-rt/lib/builtins/fixunsxfsi.c @@ -32,8 +32,8 @@ #pragma 
warning(disable : 4700) #endif -COMPILER_RT_ABI su_int __fixunsxfsi(long double a) { - long_double_bits fb; +COMPILER_RT_ABI su_int __fixunsxfsi(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixunsxfti.c b/compiler-rt/lib/builtins/fixunsxfti.c index 508554e..be3f75f 100644 --- a/compiler-rt/lib/builtins/fixunsxfti.c +++ b/compiler-rt/lib/builtins/fixunsxfti.c @@ -25,8 +25,8 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI tu_int __fixunsxfti(long double a) { - long_double_bits fb; +COMPILER_RT_ABI tu_int __fixunsxfti(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixxfdi.c b/compiler-rt/lib/builtins/fixxfdi.c index 86cf376..35d7083 100644 --- a/compiler-rt/lib/builtins/fixxfdi.c +++ b/compiler-rt/lib/builtins/fixxfdi.c @@ -31,10 +31,10 @@ #pragma warning(disable : 4700) #endif -COMPILER_RT_ABI di_int __fixxfdi(long double a) { +COMPILER_RT_ABI di_int __fixxfdi(xf_float a) { const di_int di_max = (di_int)((~(du_int)0) / 2); const di_int di_min = -di_max - 1; - long_double_bits fb; + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0) diff --git a/compiler-rt/lib/builtins/fixxfti.c b/compiler-rt/lib/builtins/fixxfti.c index 90e0311..95038df 100644 --- a/compiler-rt/lib/builtins/fixxfti.c +++ b/compiler-rt/lib/builtins/fixxfti.c @@ -24,10 +24,10 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI ti_int __fixxfti(long double a) { +COMPILER_RT_ABI ti_int __fixxfti(xf_float a) { const ti_int ti_max = (ti_int)((~(tu_int)0) / 2); const ti_int ti_min = -ti_max - 1; - long_double_bits fb; + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0) diff --git a/compiler-rt/lib/builtins/floatdidf.c b/compiler-rt/lib/builtins/floatdidf.c index d37c43b..6da81f7 100644 --- a/compiler-rt/lib/builtins/floatdidf.c +++ b/compiler-rt/lib/builtins/floatdidf.c @@ -45,53 +45,11 @@ COMPILER_RT_ABI double __floatdidf(di_int a) { // flags to set, and we don't want to code-gen to an unknown soft-float // implementation. 
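// Review note (illustrative sketch, not part of the patch): every
// hand-written int-to-float body removed below rounds the same way, and the
// new shared int_to_fp_impl.inc template keeps that behavior. The integer is
// reduced to the target's mantissa width plus two extra bits: Q (the round
// bit) and R (the sticky OR of everything shifted out). In the removed code,
// "a |= (a & 4) != 0" folds the lowest kept mantissa bit P into R, so the
// following "++a" carries into the mantissa only when Q is set and (R or P)
// is set -- exactly round-to-nearest, ties-to-even -- and "a >>= 2" then
// discards Q and R. The sticky reduction by itself looks like this:

#include <stdint.h>
static uint64_t shift_right_sticky(uint64_t x, int count) { // 0 < count < 64
  const uint64_t lost = x & ((UINT64_C(1) << count) - 1);
  return (x >> count) | (lost != 0); // bit 0 remembers any lost ones
}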
-COMPILER_RT_ABI double __floatdidf(di_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(di_int) * CHAR_BIT; - const di_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = ((du_int)a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((du_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)s & 0x80000000) | // sign - ((su_int)(e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +#define SRC_I64 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI double __floatdidf(di_int a) { return __floatXiYf__(a); } #endif #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/floatdisf.c b/compiler-rt/lib/builtins/floatdisf.c index 5c63164..0bb88c5 100644 --- a/compiler-rt/lib/builtins/floatdisf.c +++ b/compiler-rt/lib/builtins/floatdisf.c @@ -19,52 +19,11 @@ #include "int_lib.h" -COMPILER_RT_ABI float __floatdisf(di_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(di_int) * CHAR_BIT; - const di_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); // number of significant digits - si_int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = ((du_int)a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((du_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((su_int)s & 0x80000000) | // sign - ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +#define SRC_I64 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI float __floatdisf(di_int a) { return __floatXiYf__(a); } #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) diff --git 
a/compiler-rt/lib/builtins/floatditf.c b/compiler-rt/lib/builtins/floatditf.c index 9b07b65..c6e326a 100644 --- a/compiler-rt/lib/builtins/floatditf.c +++ b/compiler-rt/lib/builtins/floatditf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatditf(di_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatdixf.c b/compiler-rt/lib/builtins/floatdixf.c index ad5deb2..3d9e664 100644 --- a/compiler-rt/lib/builtins/floatdixf.c +++ b/compiler-rt/lib/builtins/floatdixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatdixf(di_int a) { +COMPILER_RT_ABI xf_float __floatdixf(di_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(di_int) * CHAR_BIT; @@ -31,7 +31,7 @@ COMPILER_RT_ABI long double __floatdixf(di_int a) { a = (a ^ s) - s; int clz = __builtin_clzll(a); int e = (N - 1) - clz; // exponent - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = ((su_int)s & 0x00008000) | // sign (e + 16383); // exponent fb.u.low.all = a << clz; // mantissa diff --git a/compiler-rt/lib/builtins/floatsitf.c b/compiler-rt/lib/builtins/floatsitf.c index 92f207a..314a8a7 100644 --- a/compiler-rt/lib/builtins/floatsitf.c +++ b/compiler-rt/lib/builtins/floatsitf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatsitf(si_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floattidf.c b/compiler-rt/lib/builtins/floattidf.c index 0a1c04b..ef8fe18 100644 --- a/compiler-rt/lib/builtins/floattidf.c +++ b/compiler-rt/lib/builtins/floattidf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_I128 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a double, rounding toward even. 
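// Review note (not part of the patch): the pattern used throughout these
// float*.c files is compile-time templating by macro selection. Each file
// picks a (source, destination) pair and includes the shared implementation,
// which expands to a __floatXiYf__ specialized for that pair:
//
//   #define SRC_I128                  // source: signed 128-bit integer
//   #define DST_DOUBLE                // destination: IEEE-754 binary64
//   #include "int_to_fp_impl.inc"     // instantiates __floatXiYf__
//   COMPILER_RT_ABI double __floattidf(ti_int a) { return __floatXiYf__(a); }
//
// One audited rounding routine now backs all of the di/ti/undi/unti variants
// that previously carried near-identical hand-maintained bodies.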
// Assumption: double is a IEEE 64 bit floating point type @@ -22,52 +26,6 @@ // seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm // mmmm -COMPILER_RT_ABI double __floattidf(ti_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)s & 0x80000000) | // sign - ((e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +COMPILER_RT_ABI double __floattidf(ti_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floattisf.c b/compiler-rt/lib/builtins/floattisf.c index a8fcdbe..7758990 100644 --- a/compiler-rt/lib/builtins/floattisf.c +++ b/compiler-rt/lib/builtins/floattisf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_I128 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a float, rounding toward even. 
// Assumption: float is a IEEE 32 bit floating point type @@ -21,51 +25,6 @@ // seee eeee emmm mmmm mmmm mmmm mmmm mmmm -COMPILER_RT_ABI float __floattisf(ti_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((tu_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((su_int)s & 0x80000000) | // sign - ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +COMPILER_RT_ABI float __floattisf(ti_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floattitf.c b/compiler-rt/lib/builtins/floattitf.c index 196cbda..5dffe22 100644 --- a/compiler-rt/lib/builtins/floattitf.c +++ b/compiler-rt/lib/builtins/floattitf.c @@ -16,6 +16,11 @@ #include "fp_lib.h" #include "int_lib.h" +#if defined(CRT_HAS_TF_MODE) +#define SRC_I128 +#define DST_QUAD +#include "int_to_fp_impl.inc" + // Returns: convert a ti_int to a fp_t, rounding toward even. 
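// Review note (not part of the patch): the recurring guard change in these
// quad-precision files -- CRT_HAS_128BIT && CRT_LDBL_128BIT becoming
// CRT_HAS_TF_MODE -- is what lets the tf_* builtins exist on targets where
// long double is not binary128. CRT_LDBL_128BIT held only when long double
// itself was IEEE quad; CRT_HAS_TF_MODE (from int_types.h) instead asks
// whether any suitable 128-bit float type exists, binding tf_float to either
// long double or _Float128 as appropriate. Roughly (sketch only; see
// int_types.h for the exact condition):
//
//   #if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
//   #define CRT_HAS_TF_MODE
//   #endif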
// Assumption: fp_t is a IEEE 128 bit floating point type @@ -25,54 +30,6 @@ // mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) -COMPILER_RT_ABI fp_t __floattitf(ti_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > LDBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit LDBL_MANT_DIG-1 bits to the right of 1 - // Q = bit LDBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case LDBL_MANT_DIG + 1: - a <<= 1; - break; - case LDBL_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (LDBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << LDBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to LDBL_MANT_DIG bits - } else { - a <<= (LDBL_MANT_DIG - sd); - // a is now rounded to LDBL_MANT_DIG bits - } - - long_double_bits fb; - fb.u.high.all = (s & 0x8000000000000000LL) // sign - | (du_int)(e + 16383) << 48 // exponent - | ((a >> 64) & 0x0000ffffffffffffLL); // significand - fb.u.low.all = (du_int)(a); - return fb.f; -} +COMPILER_RT_ABI fp_t __floattitf(ti_int a) { return __floatXiYf__(a); } #endif diff --git a/compiler-rt/lib/builtins/floattixf.c b/compiler-rt/lib/builtins/floattixf.c index 23796f1..c80bc71 100644 --- a/compiler-rt/lib/builtins/floattixf.c +++ b/compiler-rt/lib/builtins/floattixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floattixf(ti_int a) { +COMPILER_RT_ABI xf_float __floattixf(ti_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(ti_int) * CHAR_BIT; @@ -63,7 +63,7 @@ COMPILER_RT_ABI long double __floattixf(ti_int a) { a <<= (LDBL_MANT_DIG - sd); // a is now rounded to LDBL_MANT_DIG bits } - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = ((su_int)s & 0x8000) | // sign (e + 16383); // exponent fb.u.low.all = (du_int)a; // mantissa diff --git a/compiler-rt/lib/builtins/floatundidf.c b/compiler-rt/lib/builtins/floatundidf.c index 2ec802c..9743e96 100644 --- a/compiler-rt/lib/builtins/floatundidf.c +++ b/compiler-rt/lib/builtins/floatundidf.c @@ -51,50 +51,11 @@ COMPILER_RT_ABI double __floatundidf(du_int a) { // flags to set, and we don't want to code-gen to an unknown soft-float // implementation. 
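// Review note (illustrative sketch, not part of the patch): a concrete probe
// of the ties-to-even rounding that both the removed body below and the
// shared template must preserve. 2^53 is the first point where double drops
// integer precision, so a trailing 1 bit is decided by the tie rule (on
// soft-float targets such as this library's rv64 build, these casts lower to
// __floatundidf):

#include <assert.h>
#include <stdint.h>
static void check_ties_to_even(void) {
  const uint64_t p53 = UINT64_C(1) << 53;
  assert((double)(p53 + 1) == (double)p53);       // tie: even neighbor 2^53 wins
  assert((double)(p53 + 3) == (double)(p53 + 4)); // tie: rounds up to even 2^53+4
}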
-COMPILER_RT_ABI double __floatundidf(du_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(du_int) * CHAR_BIT; - int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((du_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)(e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +#define SRC_U64 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI double __floatundidf(du_int a) { return __floatXiYf__(a); } #endif #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/floatundisf.c b/compiler-rt/lib/builtins/floatundisf.c index 2a4157d..d4b418e 100644 --- a/compiler-rt/lib/builtins/floatundisf.c +++ b/compiler-rt/lib/builtins/floatundisf.c @@ -19,49 +19,11 @@ #include "int_lib.h" -COMPILER_RT_ABI float __floatundisf(du_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(du_int) * CHAR_BIT; - int sd = N - __builtin_clzll(a); // number of significant digits - si_int e = sd - 1; // 8 exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((du_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +#define SRC_U64 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI float __floatundisf(du_int a) { return __floatXiYf__(a); } #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) diff --git a/compiler-rt/lib/builtins/floatunditf.c b/compiler-rt/lib/builtins/floatunditf.c index 8d31085..abe0ca9 100644 --- a/compiler-rt/lib/builtins/floatunditf.c +++ 
b/compiler-rt/lib/builtins/floatunditf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatunditf(du_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatundixf.c b/compiler-rt/lib/builtins/floatundixf.c index 85264ad..3e3c655 100644 --- a/compiler-rt/lib/builtins/floatundixf.c +++ b/compiler-rt/lib/builtins/floatundixf.c @@ -22,13 +22,13 @@ // gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatundixf(du_int a) { +COMPILER_RT_ABI xf_float __floatundixf(du_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(du_int) * CHAR_BIT; int clz = __builtin_clzll(a); int e = (N - 1) - clz; // exponent - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = (e + 16383); // exponent fb.u.low.all = a << clz; // mantissa return fb.f; diff --git a/compiler-rt/lib/builtins/floatunsitf.c b/compiler-rt/lib/builtins/floatunsitf.c index 7ba1fb6..3f0a524 100644 --- a/compiler-rt/lib/builtins/floatunsitf.c +++ b/compiler-rt/lib/builtins/floatunsitf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatunsitf(su_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatuntidf.c b/compiler-rt/lib/builtins/floatuntidf.c index e69e65c..9abeacc 100644 --- a/compiler-rt/lib/builtins/floatuntidf.c +++ b/compiler-rt/lib/builtins/floatuntidf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_U128 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a double, rounding toward even. 
// Assumption: double is a IEEE 64 bit floating point type @@ -22,49 +26,6 @@ // seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm // mmmm -COMPILER_RT_ABI double __floatuntidf(tu_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +COMPILER_RT_ABI double __floatuntidf(tu_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floatuntisf.c b/compiler-rt/lib/builtins/floatuntisf.c index 9dec0ab..997c156 100644 --- a/compiler-rt/lib/builtins/floatuntisf.c +++ b/compiler-rt/lib/builtins/floatuntisf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_U128 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a float, rounding toward even. 
// Assumption: float is a IEEE 32 bit floating point type @@ -21,48 +25,6 @@ // seee eeee emmm mmmm mmmm mmmm mmmm mmmm -COMPILER_RT_ABI float __floatuntisf(tu_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((tu_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +COMPILER_RT_ABI float __floatuntisf(tu_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floatuntitf.c b/compiler-rt/lib/builtins/floatuntitf.c index d308d31..1c5998a 100644 --- a/compiler-rt/lib/builtins/floatuntitf.c +++ b/compiler-rt/lib/builtins/floatuntitf.c @@ -16,6 +16,11 @@ #include "fp_lib.h" #include "int_lib.h" +#if defined(CRT_HAS_TF_MODE) +#define SRC_U128 +#define DST_QUAD +#include "int_to_fp_impl.inc" + // Returns: convert a tu_int to a fp_t, rounding toward even. 
// Assumption: fp_t is a IEEE 128 bit floating point type @@ -25,51 +30,6 @@ // mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) -COMPILER_RT_ABI fp_t __floatuntitf(tu_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > LDBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit LDBL_MANT_DIG-1 bits to the right of 1 - // Q = bit LDBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case LDBL_MANT_DIG + 1: - a <<= 1; - break; - case LDBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (LDBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << LDBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to LDBL_MANT_DIG bits - } else { - a <<= (LDBL_MANT_DIG - sd); - // a is now rounded to LDBL_MANT_DIG bits - } - - long_double_bits fb; - fb.u.high.all = (du_int)(e + 16383) << 48 // exponent - | ((a >> 64) & 0x0000ffffffffffffLL); // significand - fb.u.low.all = (du_int)(a); - return fb.f; -} +COMPILER_RT_ABI fp_t __floatuntitf(tu_int a) { return __floatXiYf__(a); } #endif diff --git a/compiler-rt/lib/builtins/floatuntixf.c b/compiler-rt/lib/builtins/floatuntixf.c index efd8a27..4c53775 100644 --- a/compiler-rt/lib/builtins/floatuntixf.c +++ b/compiler-rt/lib/builtins/floatuntixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatuntixf(tu_int a) { +COMPILER_RT_ABI xf_float __floatuntixf(tu_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(tu_int) * CHAR_BIT; @@ -61,7 +61,7 @@ COMPILER_RT_ABI long double __floatuntixf(tu_int a) { a <<= (LDBL_MANT_DIG - sd); // a is now rounded to LDBL_MANT_DIG bits } - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = (e + 16383); // exponent fb.u.low.all = (du_int)a; // mantissa return fb.f; diff --git a/compiler-rt/lib/builtins/fp_add_impl.inc b/compiler-rt/lib/builtins/fp_add_impl.inc index 7133358..d205999 100644 --- a/compiler-rt/lib/builtins/fp_add_impl.inc +++ b/compiler-rt/lib/builtins/fp_add_impl.inc @@ -91,7 +91,7 @@ static __inline fp_t __addXf3__(fp_t a, fp_t b) { // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. 
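// Review note (not part of the patch): the cast below cannot change the
// value. By this point __addXf3__ has already swapped its operands so that
// aExponent >= bExponent, making the difference non-negative; the explicit
// (unsigned int) conversion just documents the narrowing and keeps
// implicit-conversion warnings quiet. Sketch:
//
//   int aExponent, bExponent;   // invariant: aExponent >= bExponent
//   unsigned int align = (unsigned int)(aExponent - bExponent); // exact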
- const unsigned int align = aExponent - bExponent; + const unsigned int align = (unsigned int)(aExponent - bExponent); if (align) { if (align < typeWidth) { const bool sticky = (bSignificand << (typeWidth - align)) != 0; diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h index eee4722..22bf2b2 100644 --- a/compiler-rt/lib/builtins/fp_extend.h +++ b/compiler-rt/lib/builtins/fp_extend.h @@ -20,24 +20,37 @@ typedef float src_t; typedef uint32_t src_rep_t; #define SRC_REP_C UINT32_C -static const int srcSigBits = 23; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 23; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #define src_rep_t_clz clzsi #elif defined SRC_DOUBLE typedef double src_t; typedef uint64_t src_rep_t; #define SRC_REP_C UINT64_C -static const int srcSigBits = 52; -static __inline int src_rep_t_clz(src_rep_t a) { -#if defined __LP64__ - return __builtin_clzl(a); -#else - if (a & REP_C(0xffffffff00000000)) - return clzsi(a >> 32); - else - return 32 + clzsi(a & REP_C(0xffffffff)); -#endif -} +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 52; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 11; + +static inline int src_rep_t_clz_impl(src_rep_t a) { return __builtin_clzll(a); } +#define src_rep_t_clz src_rep_t_clz_impl + +#elif defined SRC_80 +typedef xf_float src_t; +typedef __uint128_t src_rep_t; +#define SRC_REP_C (__uint128_t) +// sign bit, exponent and significand occupy the lower 80 bits. +static const int srcBits = 80; +static const int srcSigFracBits = 63; +// -1 accounts for the sign bit. +// -1 accounts for the explicitly stored integer bit. +// srcBits - srcSigFracBits - 1 - 1 +static const int srcExpBits = 15; #elif defined SRC_HALF #ifdef COMPILER_RT_HAS_FLOAT16 @@ -47,7 +60,31 @@ typedef uint16_t src_t; #endif typedef uint16_t src_rep_t; #define SRC_REP_C UINT16_C -static const int srcSigBits = 10; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 10; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 5; + +static inline int src_rep_t_clz_impl(src_rep_t a) { + return __builtin_clz(a) - 16; +} + +#define src_rep_t_clz src_rep_t_clz_impl + +#elif defined SRC_BFLOAT16 +#ifdef COMPILER_RT_HAS_BFLOAT16 +typedef __bf16 src_t; +#else +typedef uint16_t src_t; +#endif +typedef uint16_t src_rep_t; +#define SRC_REP_C UINT16_C +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 7; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #define src_rep_t_clz __builtin_clz #else @@ -58,28 +95,72 @@ static const int srcSigBits = 10; typedef float dst_t; typedef uint32_t dst_rep_t; #define DST_REP_C UINT32_C -static const int dstSigBits = 23; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 23; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #elif defined DST_DOUBLE typedef double dst_t; typedef uint64_t dst_rep_t; #define DST_REP_C UINT64_C -static const int dstSigBits = 52; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 52; +// -1 accounts for the sign bit. 
+// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 11; #elif defined DST_QUAD -typedef long double dst_t; +typedef tf_float dst_t; typedef __uint128_t dst_rep_t; #define DST_REP_C (__uint128_t) -static const int dstSigBits = 112; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 112; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 15; #else #error Destination should be single, double, or quad precision! #endif // end destination precision -// End of specialization parameters. Two helper routines for conversion to and -// from the representation of floating-point data as integer values follow. +// End of specialization parameters. + +// TODO: These helper routines should be placed into fp_lib.h +// Currently they depend on macros/constants defined above. + +static inline src_rep_t extract_sign_from_src(src_rep_t x) { + const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1); + return (x & srcSignMask) >> (srcBits - 1); +} + +static inline src_rep_t extract_exp_from_src(src_rep_t x) { + const int srcSigBits = srcBits - 1 - srcExpBits; + const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits; + return (x & srcExpMask) >> srcSigBits; +} + +static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) { + const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1; + return x & srcSigFracMask; +} + +#ifdef src_rep_t_clz +static inline int clz_in_sig_frac(src_rep_t sigFrac) { + const int skip = 1 + srcExpBits; + return src_rep_t_clz(sigFrac) - skip; +} +#endif + +static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) { + return (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac; +} + +// Two helper routines for conversion to and from the representation of +// floating-point data as integer values follow. -static __inline src_rep_t srcToRep(src_t x) { +static inline src_rep_t srcToRep(src_t x) { const union { src_t f; src_rep_t i; @@ -87,7 +168,7 @@ static __inline src_rep_t srcToRep(src_t x) { return rep.i; } -static __inline dst_t dstFromRep(dst_rep_t x) { +static inline dst_t dstFromRep(dst_rep_t x) { const union { dst_t f; dst_rep_t i; diff --git a/compiler-rt/lib/builtins/fp_extend_impl.inc b/compiler-rt/lib/builtins/fp_extend_impl.inc index d1c9c02..f4f6630 100644 --- a/compiler-rt/lib/builtins/fp_extend_impl.inc +++ b/compiler-rt/lib/builtins/fp_extend_impl.inc @@ -37,71 +37,72 @@ #include "fp_extend.h" +// The source type may use a usual IEEE-754 interchange format or Intel 80-bit +// format. In particular, for the source type srcSigFracBits may be not equal to +// srcSigBits. The destination type is assumed to be one of IEEE-754 standard +// types. static __inline dst_t __extendXfYf2__(src_t a) { // Various constants whose values follow from the type parameters. // Any reasonable optimizer will fold and propagate all of these. 
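// Review note (illustrative sketch, not part of the patch): the rewrite below
// stops juggling a packed "absResult" and instead carries sign, exponent and
// fraction as separate fields, reassembled at the end by construct_dst_rep().
// Field-wise handling is what makes the non-IEEE 80-bit source (explicit
// integer bit, discontiguous layout) expressible. For a normal number the
// extension is just a rebias plus a left-align; e.g. float -> double by hand:

#include <stdint.h>
#include <string.h>
static double extend_normal_f32_sketch(float x) { // normal, finite inputs only
  uint32_t r;
  memcpy(&r, &x, sizeof r);
  const uint64_t sign = r >> 31;
  const uint64_t exp = ((r >> 23) & 0xff) + (1023 - 127);      // rebias 8 -> 11 bits
  const uint64_t frac = (uint64_t)(r & 0x7fffff) << (52 - 23); // left-align fraction
  const uint64_t bits = (sign << 63) | (exp << 52) | frac;
  double d;
  memcpy(&d, &bits, sizeof d);
  return d;
}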
- const int srcBits = sizeof(src_t) * CHAR_BIT; - const int srcExpBits = srcBits - srcSigBits - 1; const int srcInfExp = (1 << srcExpBits) - 1; const int srcExpBias = srcInfExp >> 1; - const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits; - const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits; - const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits); - const src_rep_t srcAbsMask = srcSignMask - 1; - const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1); - const src_rep_t srcNaNCode = srcQNaN - 1; - - const int dstBits = sizeof(dst_t) * CHAR_BIT; - const int dstExpBits = dstBits - dstSigBits - 1; const int dstInfExp = (1 << dstExpBits) - 1; const int dstExpBias = dstInfExp >> 1; - const dst_rep_t dstMinNormal = DST_REP_C(1) << dstSigBits; - // Break a into a sign and representation of the absolute value. const src_rep_t aRep = srcToRep(a); - const src_rep_t aAbs = aRep & srcAbsMask; - const src_rep_t sign = aRep & srcSignMask; - dst_rep_t absResult; + const src_rep_t srcSign = extract_sign_from_src(aRep); + const src_rep_t srcExp = extract_exp_from_src(aRep); + const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep); + + dst_rep_t dstSign = srcSign; + dst_rep_t dstExp; + dst_rep_t dstSigFrac; - // If sizeof(src_rep_t) < sizeof(int), the subtraction result is promoted - // to (signed) int. To avoid that, explicitly cast to src_rep_t. - if ((src_rep_t)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) { + if (srcExp >= 1 && srcExp < (src_rep_t)srcInfExp) { // a is a normal number. - // Extend to the destination type by shifting the significand and - // exponent into the proper position and rebiasing the exponent. - absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits); - absResult += (dst_rep_t)(dstExpBias - srcExpBias) << dstSigBits; + dstExp = (dst_rep_t)srcExp + (dst_rep_t)(dstExpBias - srcExpBias); + dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits); } - else if (aAbs >= srcInfinity) { + else if (srcExp == srcInfExp) { // a is NaN or infinity. - // Conjure the result by beginning with infinity, then setting the qNaN - // bit (if needed) and right-aligning the rest of the trailing NaN - // payload field. - absResult = (dst_rep_t)dstInfExp << dstSigBits; - absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - srcSigBits); - absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - srcSigBits); + dstExp = dstInfExp; + dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits); } - else if (aAbs) { + else if (srcSigFrac) { // a is denormal. - // renormalize the significand and clear the leading bit, then insert - // the correct adjusted exponent in the destination type. - const int scale = src_rep_t_clz(aAbs) - src_rep_t_clz(srcMinNormal); - absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits + scale); - absResult ^= dstMinNormal; - const int resultExponent = dstExpBias - srcExpBias - scale + 1; - absResult |= (dst_rep_t)resultExponent << dstSigBits; + if (srcExpBits == dstExpBits) { + // The exponent fields are identical and this is a denormal number, so all + // the non-significand bits are zero. In particular, this branch is always + // taken when we extend a denormal F80 to F128. + dstExp = 0; + dstSigFrac = ((dst_rep_t)srcSigFrac) << (dstSigFracBits - srcSigFracBits); + } else { +#ifndef src_rep_t_clz + // If src_rep_t_clz is not defined this branch must be unreachable. + __builtin_unreachable(); +#else + // Renormalize the significand and clear the leading bit. 
+ // For F80 -> F128 this codepath is unused. + const int scale = clz_in_sig_frac(srcSigFrac) + 1; + dstExp = dstExpBias - srcExpBias - scale + 1; + dstSigFrac = (dst_rep_t)srcSigFrac + << (dstSigFracBits - srcSigFracBits + scale); + const dst_rep_t dstMinNormal = DST_REP_C(1) << (dstBits - 1 - dstExpBits); + dstSigFrac ^= dstMinNormal; +#endif + } } else { // a is zero. - absResult = 0; + dstExp = 0; + dstSigFrac = 0; } - // Apply the signbit to the absolute value. - const dst_rep_t result = absResult | (dst_rep_t)sign << (dstBits - srcBits); + const dst_rep_t result = construct_dst_rep(dstSign, dstExp, dstSigFrac); return dstFromRep(result); } diff --git a/compiler-rt/lib/builtins/fp_fixint_impl.inc b/compiler-rt/lib/builtins/fp_fixint_impl.inc index 2196d71..2f2f77c 100644 --- a/compiler-rt/lib/builtins/fp_fixint_impl.inc +++ b/compiler-rt/lib/builtins/fp_fixint_impl.inc @@ -34,7 +34,7 @@ static __inline fixint_t __fixint(fp_t a) { // If 0 <= exponent < significandBits, right shift to get the result. // Otherwise, shift left. if (exponent < significandBits) - return sign * (significand >> (significandBits - exponent)); + return (fixint_t)(sign * (significand >> (significandBits - exponent))); else - return sign * ((fixint_t)significand << (exponent - significandBits)); + return (fixint_t)(sign * ((fixuint_t)significand << (exponent - significandBits))); } diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index 3fb13a0..b2a8950 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -22,22 +22,11 @@ #include "int_lib.h" #include "int_math.h" +#include "int_types.h" #include <limits.h> #include <stdbool.h> #include <stdint.h> -// x86_64 FreeBSD prior v9.3 define fixed-width types incorrectly in -// 32-bit mode. -#if defined(__FreeBSD__) && defined(__i386__) -#include <sys/param.h> -#if __FreeBSD_version < 903000 // v9.3 -#define uint64_t unsigned long long -#define int64_t long long -#undef UINT64_C -#define UINT64_C(c) (c##ULL) -#endif -#endif - #if defined SINGLE_PRECISION typedef uint16_t half_rep_t; @@ -54,8 +43,8 @@ static __inline int rep_clz(rep_t a) { return clzsi(a); } // 32x32 --> 64 bit multiply static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { const uint64_t product = (uint64_t)a * b; - *hi = product >> 32; - *lo = product; + *hi = (rep_t)(product >> 32); + *lo = (rep_t)product; } COMPILER_RT_ABI fp_t __addsf3(fp_t a, fp_t b); @@ -69,16 +58,7 @@ typedef double fp_t; #define REP_C UINT64_C #define significandBits 52 -static __inline int rep_clz(rep_t a) { -#if defined __LP64__ - return __builtin_clzl(a); -#else - if (a & REP_C(0xffffffff00000000)) - return clzsi(a >> 32); - else - return 32 + clzsi(a & REP_C(0xffffffff)); -#endif -} +static inline int rep_clz(rep_t a) { return __builtin_clzll(a); } #define loWord(a) (a & 0xffffffffU) #define hiWord(a) (a >> 32) @@ -105,17 +85,18 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b); #elif defined QUAD_PRECISION -#if __LDBL_MANT_DIG__ == 113 && defined(__SIZEOF_INT128__) -#define CRT_LDBL_128BIT +#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) typedef uint64_t half_rep_t; typedef __uint128_t rep_t; typedef __int128_t srep_t; -typedef long double fp_t; +typedef tf_float fp_t; #define HALF_REP_C UINT64_C #define REP_C (__uint128_t) +#if defined(CRT_HAS_IEEE_TF) // Note: Since there is no explicit way to tell compiler the constant is a // 128-bit integer, we let the constant be casted to 128-bit integer #define
significandBits 112 +#define TF_MANT_DIG (significandBits + 1) static __inline int rep_clz(rep_t a) { const union { @@ -200,27 +181,17 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #undef Word_HiMask #undef Word_LoMask #undef Word_FullMask -#endif // __LDBL_MANT_DIG__ == 113 && __SIZEOF_INT128__ +#endif // defined(CRT_HAS_IEEE_TF) +#else +typedef long double fp_t; +#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) #else #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined. #endif #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) || \ - defined(CRT_LDBL_128BIT) + (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE)) #define typeWidth (sizeof(rep_t) * CHAR_BIT) -#define exponentBits (typeWidth - significandBits - 1) -#define maxExponent ((1 << exponentBits) - 1) -#define exponentBias (maxExponent >> 1) - -#define implicitBit (REP_C(1) << significandBits) -#define significandMask (implicitBit - 1U) -#define signBit (REP_C(1) << (significandBits + exponentBits)) -#define absMask (signBit - 1U) -#define exponentMask (absMask ^ significandMask) -#define oneRep ((rep_t)exponentBias << significandBits) -#define infRep exponentMask -#define quietBit (implicitBit >> 1) -#define qnanRep (exponentMask | quietBit) static __inline rep_t toRep(fp_t x) { const union { @@ -238,13 +209,28 @@ static __inline fp_t fromRep(rep_t x) { return rep.f; } +#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) +#define exponentBits (typeWidth - significandBits - 1) +#define maxExponent ((1 << exponentBits) - 1) +#define exponentBias (maxExponent >> 1) + +#define implicitBit (REP_C(1) << significandBits) +#define significandMask (implicitBit - 1U) +#define signBit (REP_C(1) << (significandBits + exponentBits)) +#define absMask (signBit - 1U) +#define exponentMask (absMask ^ significandMask) +#define oneRep ((rep_t)exponentBias << significandBits) +#define infRep exponentMask +#define quietBit (implicitBit >> 1) +#define qnanRep (exponentMask | quietBit) + static __inline int normalize(rep_t *significand) { const int shift = rep_clz(*significand) - rep_clz(implicitBit); *significand <<= shift; return 1 - shift; } -static __inline void wideLeftShift(rep_t *hi, rep_t *lo, int count) { +static __inline void wideLeftShift(rep_t *hi, rep_t *lo, unsigned int count) { *hi = *hi << count | *lo >> (typeWidth - count); *lo = *lo << count; } @@ -340,6 +326,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) { return fromRep(sign | ((rep_t)exp << significandBits) | sig); } +#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) + // Avoid using fmax from libm. static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) { // If either argument is NaN, return the other argument. If both are NaN, @@ -386,31 +374,42 @@ static __inline fp_t __compiler_rt_fmax(fp_t x, fp_t y) { #endif } -#elif defined(QUAD_PRECISION) - -#if defined(CRT_LDBL_128BIT) -static __inline fp_t __compiler_rt_logbl(fp_t x) { +#elif defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE) +// The generic implementation only works for ieee754 floating point. For other +// floating point types, continue to rely on the libm implementation for now. 
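// Review note (not part of the patch): the block below is a three-way
// dispatch for the quad-precision helpers. If tf_float is IEEE binary128
// (CRT_HAS_IEEE_TF), the generic bit-level __compiler_rt_*X versions above
// are used; failing that, if long double is 128 bits (CRT_LDBL_128BIT), the
// libm long-double routines stand in; anything else is a hard error. The
// trailing #defines keep the older *l spellings valid either way. Callers
// such as __divtc3 use the helpers like this:
//
//   tf_float m = __compiler_rt_fmaxtf(crt_fabstf(c), crt_fabstf(d));
//   int e = (int)__compiler_rt_logbtf(m);  // exponent of the larger magnitude
//   c = __compiler_rt_scalbntf(c, -e);     // rescale to avoid overflow
//   d = __compiler_rt_scalbntf(d, -e);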
+#if defined(CRT_HAS_IEEE_TF) +static __inline tf_float __compiler_rt_logbtf(tf_float x) { return __compiler_rt_logbX(x); } -static __inline fp_t __compiler_rt_scalbnl(fp_t x, int y) { +static __inline tf_float __compiler_rt_scalbntf(tf_float x, int y) { return __compiler_rt_scalbnX(x, y); } -static __inline fp_t __compiler_rt_fmaxl(fp_t x, fp_t y) { +static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { return __compiler_rt_fmaxX(x, y); } -#else -// The generic implementation only works for ieee754 floating point. For other -// floating point types, continue to rely on the libm implementation for now. -static __inline long double __compiler_rt_logbl(long double x) { +#define __compiler_rt_logbl __compiler_rt_logbtf +#define __compiler_rt_scalbnl __compiler_rt_scalbntf +#define __compiler_rt_fmaxl __compiler_rt_fmaxtf +#define crt_fabstf crt_fabsf128 +#define crt_copysigntf crt_copysignf128 +#elif defined(CRT_LDBL_128BIT) +static __inline tf_float __compiler_rt_logbtf(tf_float x) { return crt_logbl(x); } -static __inline long double __compiler_rt_scalbnl(long double x, int y) { +static __inline tf_float __compiler_rt_scalbntf(tf_float x, int y) { return crt_scalbnl(x, y); } -static __inline long double __compiler_rt_fmaxl(long double x, long double y) { +static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { return crt_fmaxl(x, y); } -#endif // CRT_LDBL_128BIT +#define __compiler_rt_logbl crt_logbl +#define __compiler_rt_scalbnl crt_scalbnl +#define __compiler_rt_fmaxl crt_fmaxl +#define crt_fabstf crt_fabsl +#define crt_copysigntf crt_copysignl +#else +#error Unsupported TF mode type +#endif #endif // *_PRECISION diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h index 91f6145..141fe63 100644 --- a/compiler-rt/lib/builtins/fp_trunc.h +++ b/compiler-rt/lib/builtins/fp_trunc.h @@ -19,19 +19,31 @@ typedef float src_t; typedef uint32_t src_rep_t; #define SRC_REP_C UINT32_C -static const int srcSigBits = 23; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 23; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #elif defined SRC_DOUBLE typedef double src_t; typedef uint64_t src_rep_t; #define SRC_REP_C UINT64_C -static const int srcSigBits = 52; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 52; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 11; #elif defined SRC_QUAD -typedef long double src_t; +typedef tf_float src_t; typedef __uint128_t src_rep_t; #define SRC_REP_C (__uint128_t) -static const int srcSigBits = 112; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 112; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 15; #else #error Source should be double precision or quad precision! @@ -41,13 +53,32 @@ static const int srcSigBits = 112; typedef double dst_t; typedef uint64_t dst_rep_t; #define DST_REP_C UINT64_C -static const int dstSigBits = 52; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 52; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 11; + +#elif defined DST_80 +typedef xf_float dst_t; +typedef __uint128_t dst_rep_t; +#define DST_REP_C (__uint128_t) +static const int dstBits = 80; +static const int dstSigFracBits = 63; +// -1 accounts for the sign bit. 
+// -1 accounts for the explicitly stored integer bit. +// dstBits - dstSigFracBits - 1 - 1 +static const int dstExpBits = 15; #elif defined DST_SINGLE typedef float dst_t; typedef uint32_t dst_rep_t; #define DST_REP_C UINT32_C -static const int dstSigBits = 23; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 23; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #elif defined DST_HALF #ifdef COMPILER_RT_HAS_FLOAT16 @@ -57,22 +88,58 @@ typedef uint16_t dst_t; #endif typedef uint16_t dst_rep_t; #define DST_REP_C UINT16_C -static const int dstSigBits = 10; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 10; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 5; #elif defined DST_BFLOAT typedef __bf16 dst_t; typedef uint16_t dst_rep_t; #define DST_REP_C UINT16_C -static const int dstSigBits = 7; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 7; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #else #error Destination should be single precision or double precision! #endif // end destination precision +// TODO: These helper routines should be placed into fp_lib.h +// Currently they depend on macros/constants defined above. + +static inline src_rep_t extract_sign_from_src(src_rep_t x) { + const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1); + return (x & srcSignMask) >> (srcBits - 1); +} + +static inline src_rep_t extract_exp_from_src(src_rep_t x) { + const int srcSigBits = srcBits - 1 - srcExpBits; + const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits; + return (x & srcExpMask) >> srcSigBits; +} + +static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) { + const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1; + return x & srcSigFracMask; +} + +static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) { + dst_rep_t result = (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac; + // Set the explicit integer bit in F80 if present. + if (dstBits == 80 && exp) { + result |= (DST_REP_C(1) << dstSigFracBits); + } + return result; +} + // End of specialization parameters. Two helper routines for conversion to and // from the representation of floating-point data as integer values follow. -static __inline src_rep_t srcToRep(src_t x) { +static inline src_rep_t srcToRep(src_t x) { const union { src_t f; src_rep_t i; @@ -80,7 +147,7 @@ static __inline src_rep_t srcToRep(src_t x) { return rep.i; } -static __inline dst_t dstFromRep(dst_rep_t x) { +static inline dst_t dstFromRep(dst_rep_t x) { const union { dst_t f; dst_rep_t i; diff --git a/compiler-rt/lib/builtins/fp_trunc_impl.inc b/compiler-rt/lib/builtins/fp_trunc_impl.inc index 6662be7..f684924 100644 --- a/compiler-rt/lib/builtins/fp_trunc_impl.inc +++ b/compiler-rt/lib/builtins/fp_trunc_impl.inc @@ -38,95 +38,118 @@ #include "fp_trunc.h" +// The destination type may use a usual IEEE-754 interchange format or Intel +// 80-bit format. In particular, for the destination type dstSigFracBits may be +// not equal to dstSigBits. The source type is assumed to be one of IEEE-754 +// standard types. static __inline dst_t __truncXfYf2__(src_t a) { // Various constants whose values follow from the type parameters. // Any reasonable optimizer will fold and propagate all of these. 
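// Review note (illustrative sketch, not part of the patch): the first new
// branch in the body below is an exact fast path taken when source and
// destination carry equally wide exponents -- e.g. binary128 -> x87, both
// 15 bits -- and no fraction bits would be lost, so the fields can be copied
// straight across. The exactness test is a round-trip shift:

#include <stdint.h>
static int tail_is_zero(uint64_t rep, int tailBits) { // 0 < tailBits < 64
  return ((rep >> tailBits) << tailBits) == rep; // low tailBits all zero?
}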
- const int srcBits = sizeof(src_t) * CHAR_BIT; - const int srcExpBits = srcBits - srcSigBits - 1; const int srcInfExp = (1 << srcExpBits) - 1; const int srcExpBias = srcInfExp >> 1; - const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits; - const src_rep_t srcSignificandMask = srcMinNormal - 1; - const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits; - const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits); - const src_rep_t srcAbsMask = srcSignMask - 1; - const src_rep_t roundMask = (SRC_REP_C(1) << (srcSigBits - dstSigBits)) - 1; - const src_rep_t halfway = SRC_REP_C(1) << (srcSigBits - dstSigBits - 1); - const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1); + const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigFracBits; + const src_rep_t roundMask = + (SRC_REP_C(1) << (srcSigFracBits - dstSigFracBits)) - 1; + const src_rep_t halfway = SRC_REP_C(1) + << (srcSigFracBits - dstSigFracBits - 1); + const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigFracBits - 1); const src_rep_t srcNaNCode = srcQNaN - 1; - const int dstBits = sizeof(dst_t) * CHAR_BIT; - const int dstExpBits = dstBits - dstSigBits - 1; const int dstInfExp = (1 << dstExpBits) - 1; const int dstExpBias = dstInfExp >> 1; - - const int underflowExponent = srcExpBias + 1 - dstExpBias; const int overflowExponent = srcExpBias + dstInfExp - dstExpBias; - const src_rep_t underflow = (src_rep_t)underflowExponent << srcSigBits; - const src_rep_t overflow = (src_rep_t)overflowExponent << srcSigBits; - const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigBits - 1); + const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigFracBits - 1); const dst_rep_t dstNaNCode = dstQNaN - 1; - // Break a into a sign and representation of the absolute value. const src_rep_t aRep = srcToRep(a); - const src_rep_t aAbs = aRep & srcAbsMask; - const src_rep_t sign = aRep & srcSignMask; - dst_rep_t absResult; + const src_rep_t srcSign = extract_sign_from_src(aRep); + const src_rep_t srcExp = extract_exp_from_src(aRep); + const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep); + + dst_rep_t dstSign = srcSign; + dst_rep_t dstExp; + dst_rep_t dstSigFrac; + + // Same size exponents and a's significand tail is 0. + // The significand can be truncated and the exponent can be copied over. + const int sigFracTailBits = srcSigFracBits - dstSigFracBits; + if (srcExpBits == dstExpBits && + ((aRep >> sigFracTailBits) << sigFracTailBits) == aRep) { + dstExp = srcExp; + dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); + return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); + } - if (aAbs - underflow < aAbs - overflow) { + const int dstExpCandidate = ((int)srcExp - srcExpBias) + dstExpBias; + if (dstExpCandidate >= 1 && dstExpCandidate < dstInfExp) { // The exponent of a is within the range of normal numbers in the - // destination format. We can convert by simply right-shifting with + // destination format. We can convert by simply right-shifting with // rounding and adjusting the exponent. - absResult = aAbs >> (srcSigBits - dstSigBits); - absResult -= (dst_rep_t)(srcExpBias - dstExpBias) << dstSigBits; + dstExp = dstExpCandidate; + dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); - const src_rep_t roundBits = aAbs & roundMask; + const src_rep_t roundBits = srcSigFrac & roundMask; // Round to nearest. if (roundBits > halfway) - absResult++; + dstSigFrac++; // Tie to even. 
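The `roundBits > halfway` test above, together with the ties-to-even branch that follows, implements IEEE round-to-nearest-even. A standalone demonstration of the same policy using the hardware's double-to-float conversion (assumes a hosted C compiler with default rounding; not part of the patch):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
  // Exactly halfway between 1.0f and the next float (1 + 2^-23):
  // the tie resolves to the even significand, i.e. 1.0f.
  double lo_tie = 1.0 + ldexp(1.0, -24);
  // Exactly halfway between 1 + 2^-23 and 1 + 2^-22:
  // the tie resolves upward to the even significand 1 + 2^-22.
  double hi_tie = 1.0 + 3.0 * ldexp(1.0, -24);
  printf("%a -> %a\n", lo_tie, (double)(float)lo_tie); // 0x1.000001p+0 -> 0x1p+0
  printf("%a -> %a\n", hi_tie, (double)(float)hi_tie); // 0x1.000003p+0 -> 0x1.000004p+0
  return 0;
}
```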
else if (roundBits == halfway) - absResult += absResult & 1; - } else if (aAbs > srcInfinity) { + dstSigFrac += dstSigFrac & 1; + + // Rounding has changed the exponent. + if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { + dstExp += 1; + dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); + } + } else if (srcExp == srcInfExp && srcSigFrac) { // a is NaN. // Conjure the result by beginning with infinity, setting the qNaN // bit and inserting the (truncated) trailing NaN field. - absResult = (dst_rep_t)dstInfExp << dstSigBits; - absResult |= dstQNaN; - absResult |= - ((aAbs & srcNaNCode) >> (srcSigBits - dstSigBits)) & dstNaNCode; - } else if (aAbs >= overflow) { - // a overflows to infinity. - absResult = (dst_rep_t)dstInfExp << dstSigBits; + dstExp = dstInfExp; + dstSigFrac = dstQNaN; + dstSigFrac |= ((srcSigFrac & srcNaNCode) >> sigFracTailBits) & dstNaNCode; + } else if ((int)srcExp >= overflowExponent) { + dstExp = dstInfExp; + dstSigFrac = 0; } else { // a underflows on conversion to the destination type or is an exact // zero. The result may be a denormal or zero. Extract the exponent // to get the shift amount for the denormalization. - const int aExp = aAbs >> srcSigBits; - const int shift = srcExpBias - dstExpBias - aExp + 1; + src_rep_t significand = srcSigFrac; + int shift = srcExpBias - dstExpBias - srcExp; - const src_rep_t significand = (aRep & srcSignificandMask) | srcMinNormal; + if (srcExp) { + // Set the implicit integer bit if the source is a normal number. + significand |= srcMinNormal; + shift += 1; + } // Right shift by the denormalization amount with sticky. - if (shift > srcSigBits) { - absResult = 0; + if (shift > srcSigFracBits) { + dstExp = 0; + dstSigFrac = 0; } else { - const bool sticky = (significand << (srcBits - shift)) != 0; + dstExp = 0; + const bool sticky = shift && ((significand << (srcBits - shift)) != 0); src_rep_t denormalizedSignificand = significand >> shift | sticky; - absResult = denormalizedSignificand >> (srcSigBits - dstSigBits); + dstSigFrac = denormalizedSignificand >> sigFracTailBits; const src_rep_t roundBits = denormalizedSignificand & roundMask; // Round to nearest if (roundBits > halfway) - absResult++; + dstSigFrac++; // Ties to even else if (roundBits == halfway) - absResult += absResult & 1; + dstSigFrac += dstSigFrac & 1; + + // Rounding has changed the exponent. + if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { + dstExp += 1; + dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); + } } } - // Apply the signbit to the absolute value. - const dst_rep_t result = absResult | sign >> (srcBits - dstBits); - return dstFromRep(result); + return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); } diff --git a/compiler-rt/lib/builtins/gcc_personality_v0.c b/compiler-rt/lib/builtins/gcc_personality_v0.c index 58fd7ce..ef63a5f 100644 --- a/compiler-rt/lib/builtins/gcc_personality_v0.c +++ b/compiler-rt/lib/builtins/gcc_personality_v0.c @@ -219,7 +219,7 @@ COMPILER_RT_ABI _Unwind_Reason_Code __gcc_personality_v0( } // Walk call-site table looking for range that includes current PC. 
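In the hunk that follows, `callSiteTableLength` is widened from `uint32_t` to `size_t`: the length is ULEB128-encoded and in principle unbounded, so truncating it could misplace `callSiteTableEnd`. For reference, a minimal decoder of the shape `readULEB128()` implements (a sketch; the real helper is defined earlier in that file):

```c
#include <stddef.h>
#include <stdint.h>

// Decode one ULEB128 value and advance *p past it.
static size_t read_uleb128(const uint8_t **p) {
  size_t result = 0;
  int shift = 0;
  uint8_t byte;
  do {
    byte = *(*p)++;
    result |= (size_t)(byte & 0x7f) << shift; // 7 payload bits per byte
    shift += 7;
  } while (byte & 0x80);                      // high bit set = more bytes
  return result;
}
```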
uint8_t callSiteEncoding = *lsda++; - uint32_t callSiteTableLength = readULEB128(&lsda); + size_t callSiteTableLength = readULEB128(&lsda); const uint8_t *callSiteTableStart = lsda; const uint8_t *callSiteTableEnd = callSiteTableStart + callSiteTableLength; const uint8_t *p = callSiteTableStart; diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk.S index f0bea21..cdd9a4c 100644 --- a/compiler-rt/lib/builtins/i386/chkstk.S +++ b/compiler-rt/lib/builtins/i386/chkstk.S @@ -4,19 +4,19 @@ #include "../assembly.h" -// _chkstk routine +#ifdef __i386__ + +// _chkstk (_alloca) routine - probe stack between %esp and (%esp-%eax) in 4k increments, +// then decrement %esp by %eax. Preserves all registers except %esp and flags. // This routine is windows specific // http://msdn.microsoft.com/en-us/library/ms648426.aspx -#ifdef __i386__ - .text .balign 4 -DEFINE_COMPILERRT_FUNCTION(__chkstk_ms) +DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function push %ecx - push %eax cmp $0x1000,%eax - lea 12(%esp),%ecx + lea 8(%esp),%ecx // esp before calling this routine -> ecx jb 1f 2: sub $0x1000,%ecx @@ -27,9 +27,13 @@ DEFINE_COMPILERRT_FUNCTION(__chkstk_ms) 1: sub %eax,%ecx test %ecx,(%ecx) - pop %eax - pop %ecx + + lea 4(%esp),%eax // load pointer to the return address into eax + mov %ecx,%esp // install the new top of stack pointer into esp + mov -4(%eax),%ecx // restore ecx + push (%eax) // push return address onto the stack + sub %esp,%eax // restore the original value in eax ret -END_COMPILERRT_FUNCTION(__chkstk_ms) +END_COMPILERRT_FUNCTION(_alloca) #endif // __i386__ diff --git a/compiler-rt/lib/builtins/i386/chkstk2.S b/compiler-rt/lib/builtins/i386/chkstk2.S deleted file mode 100644 index 5d6cbdf..0000000 --- a/compiler-rt/lib/builtins/i386/chkstk2.S +++ /dev/null @@ -1,41 +0,0 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "../assembly.h" - -#ifdef __i386__ - -// _chkstk (_alloca) routine - probe stack between %esp and (%esp-%eax) in 4k increments, -// then decrement %esp by %eax. Preserves all registers except %esp and flags. 
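Both the rewritten `chkstk.S` above and the `chkstk2.S` being deleted (its remaining lines follow) implement the same Windows requirement: the stack grows one guard page at a time, so an allocation larger than 4 KiB must touch each intervening page in order. A rough C model of the probe loop (illustrative only; the real routine cannot use the stack it is probing):

```c
// 'top' models the incoming stack pointer, 'size' the requested
// allocation (as in %esp/%eax in the assembly above).
static void probe_stack(volatile char *top, unsigned long size) {
  while (size >= 0x1000) {
    top -= 0x1000;
    (void)*top;            // touch the page so the guard page faults in order
    size -= 0x1000;
  }
  (void)*(top - size);     // final probe at the new stack top
}
```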
-// This routine is windows specific -// http://msdn.microsoft.com/en-us/library/ms648426.aspx - -.text -.balign 4 -DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function -DEFINE_COMPILERRT_FUNCTION(__chkstk) - push %ecx - cmp $0x1000,%eax - lea 8(%esp),%ecx // esp before calling this routine -> ecx - jb 1f -2: - sub $0x1000,%ecx - test %ecx,(%ecx) - sub $0x1000,%eax - cmp $0x1000,%eax - ja 2b -1: - sub %eax,%ecx - test %ecx,(%ecx) - - lea 4(%esp),%eax // load pointer to the return address into eax - mov %ecx,%esp // install the new top of stack pointer into esp - mov -4(%eax),%ecx // restore ecx - push (%eax) // push return address onto the stack - sub %esp,%eax // restore the original value in eax - ret -END_COMPILERRT_FUNCTION(__chkstk) -END_COMPILERRT_FUNCTION(_alloca) - -#endif // __i386__ diff --git a/compiler-rt/lib/builtins/i386/floatdixf.S b/compiler-rt/lib/builtins/i386/floatdixf.S index 19dd083..486e3b0 100644 --- a/compiler-rt/lib/builtins/i386/floatdixf.S +++ b/compiler-rt/lib/builtins/i386/floatdixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatdixf(di_int a); +// xf_float __floatdixf(di_int a); #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/i386/floatundixf.S b/compiler-rt/lib/builtins/i386/floatundixf.S index 30b4d9f..778c3dc 100644 --- a/compiler-rt/lib/builtins/i386/floatundixf.S +++ b/compiler-rt/lib/builtins/i386/floatundixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatundixf(du_int a);16 +// xf_float __floatundixf(du_int a);16 #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h index fb791eb..f6c1b7c 100644 --- a/compiler-rt/lib/builtins/int_lib.h +++ b/compiler-rt/lib/builtins/int_lib.h @@ -49,7 +49,7 @@ #define SYMBOL_NAME(name) XSTR(__USER_LABEL_PREFIX__) #name #if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__) || \ - defined(_AIX) + defined(_AIX) || defined(__CYGWIN__) #define COMPILER_RT_ALIAS(name, aliasname) \ COMPILER_RT_ABI __typeof(name) aliasname __attribute__((__alias__(#name))); #elif defined(__APPLE__) @@ -119,14 +119,14 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem); #if defined(_MSC_VER) && !defined(__clang__) #include <intrin.h> -int __inline __builtin_ctz(uint32_t value) { +static int __inline __builtin_ctz(uint32_t value) { unsigned long trailing_zero = 0; if (_BitScanForward(&trailing_zero, value)) return trailing_zero; return 32; } -int __inline __builtin_clz(uint32_t value) { +static int __inline __builtin_clz(uint32_t value) { unsigned long leading_zero = 0; if (_BitScanReverse(&leading_zero, value)) return 31 - leading_zero; @@ -134,14 +134,14 @@ int __inline __builtin_clz(uint32_t value) { } #if defined(_M_ARM) || defined(_M_X64) -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { unsigned long leading_zero = 0; if (_BitScanReverse64(&leading_zero, value)) return 63 - leading_zero; return 64; } #else -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { if (value == 0) return 64; uint32_t msh = (uint32_t)(value >> 32); @@ -154,7 +154,7 @@ int __inline __builtin_clzll(uint64_t value) { #define __builtin_clzl __builtin_clzll -bool __inline __builtin_sadd_overflow(int x, int y, int *result) { +static bool __inline __builtin_sadd_overflow(int x, int y, int *result) { if ((x < 0) != (y < 0)) { *result = x + y; return false; diff --git a/compiler-rt/lib/builtins/int_math.h
b/compiler-rt/lib/builtins/int_math.h index 48b9580..08bfe92 100644 --- a/compiler-rt/lib/builtins/int_math.h +++ b/compiler-rt/lib/builtins/int_math.h @@ -65,6 +65,14 @@ #define crt_copysign(x, y) __builtin_copysign((x), (y)) #define crt_copysignf(x, y) __builtin_copysignf((x), (y)) #define crt_copysignl(x, y) __builtin_copysignl((x), (y)) +// We define __has_builtin to always return 0 for GCC versions below 10, +// but __builtin_copysignf128 is available since version 7. +#if __has_builtin(__builtin_copysignf128) || \ + (defined(__GNUC__) && __GNUC__ >= 7) +#define crt_copysignf128(x, y) __builtin_copysignf128((x), (y)) +#elif __has_builtin(__builtin_copysignq) +#define crt_copysignf128(x, y) __builtin_copysignq((x), (y)) +#endif #endif #if defined(_MSC_VER) && !defined(__clang__) @@ -75,6 +83,13 @@ #define crt_fabs(x) __builtin_fabs((x)) #define crt_fabsf(x) __builtin_fabsf((x)) #define crt_fabsl(x) __builtin_fabsl((x)) +// We define __has_builtin to always return 0 for GCC versions below 10, +// but __builtin_fabsf128 is available since version 7. +#if __has_builtin(__builtin_fabsf128) || (defined(__GNUC__) && __GNUC__ >= 7) +#define crt_fabsf128(x) __builtin_fabsf128((x)) +#elif __has_builtin(__builtin_fabsq) +#define crt_fabsf128(x) __builtin_fabsq((x)) +#endif #endif #if defined(_MSC_VER) && !defined(__clang__) diff --git a/compiler-rt/lib/builtins/int_mulo_impl.inc b/compiler-rt/lib/builtins/int_mulo_impl.inc index 567d8b9..27e7c8c 100644 --- a/compiler-rt/lib/builtins/int_mulo_impl.inc +++ b/compiler-rt/lib/builtins/int_mulo_impl.inc @@ -18,10 +18,10 @@ static __inline fixint_t __muloXi4(fixint_t a, fixint_t b, int *overflow) { const int N = (int)(sizeof(fixint_t) * CHAR_BIT); - const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MIN = (fixint_t)((fixuint_t)1 << (N - 1)); const fixint_t MAX = ~MIN; *overflow = 0; - fixint_t result = a * b; + fixint_t result = (fixuint_t)a * b; if (a == MIN) { if (b != 0 && b != 1) *overflow = 1; diff --git a/compiler-rt/lib/builtins/int_mulv_impl.inc b/compiler-rt/lib/builtins/int_mulv_impl.inc index 1e92071..06559cf 100644 --- a/compiler-rt/lib/builtins/int_mulv_impl.inc +++ b/compiler-rt/lib/builtins/int_mulv_impl.inc @@ -18,7 +18,7 @@ static __inline fixint_t __mulvXi3(fixint_t a, fixint_t b) { const int N = (int)(sizeof(fixint_t) * CHAR_BIT); - const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MIN = (fixint_t)((fixuint_t)1 << (N - 1)); const fixint_t MAX = ~MIN; if (a == MIN) { if (b == 0 || b == 1) diff --git a/compiler-rt/lib/builtins/int_to_fp.h b/compiler-rt/lib/builtins/int_to_fp.h new file mode 100644 index 0000000..2c1218f --- /dev/null +++ b/compiler-rt/lib/builtins/int_to_fp.h @@ -0,0 +1,82 @@ +//===-- int_to_fp.h - integer to floating point conversion ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Set source and destination defines in order to use a correctly +// parameterised floatXiYf implementation. 
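The new header (continuing below) is instantiated the same way as `fp_trunc.h`: each `float*i*f.c` translation unit picks a source/destination pair via defines and includes the shared implementation. A sketch of how a quad builtin such as `__floatditf` would use it (names follow the file's conventions; the actual .c files live elsewhere in the patch and may differ in detail):

```c
// floatditf.c (sketch): convert a signed 64-bit integer to tf_float
// via the shared __floatXiYf__ routine.
#define QUAD_PRECISION
#include "fp_lib.h"

#if defined(CRT_HAS_TF_MODE)
#define SRC_I64
#define DST_QUAD
#include "int_to_fp_impl.inc"

COMPILER_RT_ABI dst_t __floatditf(src_t a) { return __floatXiYf__(a); }
#endif
```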
+// +//===----------------------------------------------------------------------===// + +#ifndef INT_TO_FP_H +#define INT_TO_FP_H + +#include "int_lib.h" + +#if defined SRC_I64 +typedef int64_t src_t; +typedef uint64_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __builtin_clzll(x); } + +#elif defined SRC_U64 +typedef uint64_t src_t; +typedef uint64_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __builtin_clzll(x); } + +#elif defined SRC_I128 +typedef __int128_t src_t; +typedef __uint128_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __clzti2(x); } + +#elif defined SRC_U128 +typedef __uint128_t src_t; +typedef __uint128_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __clzti2(x); } + +#else +#error Source should be a handled integer type. +#endif + +#if defined DST_SINGLE +typedef float dst_t; +typedef uint32_t dst_rep_t; +#define DST_REP_C UINT32_C + +enum { + dstSigBits = 23, +}; + +#elif defined DST_DOUBLE +typedef double dst_t; +typedef uint64_t dst_rep_t; +#define DST_REP_C UINT64_C + +enum { + dstSigBits = 52, +}; + +#elif defined DST_QUAD +typedef tf_float dst_t; +typedef __uint128_t dst_rep_t; +#define DST_REP_C (__uint128_t) + +enum { + dstSigBits = 112, +}; + +#else +#error Destination should be a handled floating point type +#endif + +static __inline dst_t dstFromRep(dst_rep_t x) { + const union { + dst_t f; + dst_rep_t i; + } rep = {.i = x}; + return rep.f; +} + +#endif // INT_TO_FP_H diff --git a/compiler-rt/lib/builtins/int_to_fp_impl.inc b/compiler-rt/lib/builtins/int_to_fp_impl.inc new file mode 100644 index 0000000..51f76fd --- /dev/null +++ b/compiler-rt/lib/builtins/int_to_fp_impl.inc @@ -0,0 +1,72 @@ +//===-- int_to_fp_impl.inc - integer to floating point conversion ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a generic conversion from an integer type to an +// IEEE-754 floating point type, allowing a common implementation to be shared +// without copy and paste. +// +//===----------------------------------------------------------------------===// + +#include "int_to_fp.h" + +static __inline dst_t __floatXiYf__(src_t a) { + if (a == 0) + return 0.0; + + enum { + dstMantDig = dstSigBits + 1, + srcBits = sizeof(src_t) * CHAR_BIT, + srcIsSigned = ((src_t)-1) < 0, + }; + + const src_t s = srcIsSigned ? a >> (srcBits - 1) : 0; + + a = (usrc_t)(a ^ s) - s; + int sd = srcBits - clzSrcT(a); // number of significant digits + int e = sd - 1; // exponent + if (sd > dstMantDig) { + // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx + // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR + // 12345678901234567890123456 + // 1 = msb 1 bit + // P = bit dstMantDig-1 bits to the right of 1 + // Q = bit dstMantDig bits to the right of 1 + // R = "or" of all bits to the right of Q + if (sd == dstMantDig + 1) { + a <<= 1; + } else if (sd == dstMantDig + 2) { + // Do nothing.
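A concrete instance of the `sd == dstMantDig + 2` case just above, before the remaining branch continues below: for unsigned 64-bit to float, `dstMantDig` is 24, so a 26-significant-bit input already has its round (Q) and sticky (R) bits in position and only the shared rounding tail is needed (hosted-compiler demo, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // 2^25 + 3 has 26 significant bits, two below float precision.
  // Round-to-nearest pushes it up to 2^25 + 4, the nearest float.
  uint64_t a = (1ull << 25) + 3;
  printf("%.1f\n", (double)(float)a); // 33554436.0
  return 0;
}
```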
+ } else { + a = ((usrc_t)a >> (sd - (dstMantDig + 2))) | + ((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0); + } + // finish: + a |= (a & 4) != 0; // Or P into R + ++a; // round - this step may add a significant bit + a >>= 2; // dump Q and R + // a is now rounded to dstMantDig or dstMantDig+1 bits + if (a & ((usrc_t)1 << dstMantDig)) { + a >>= 1; + ++e; + } + // a is now rounded to dstMantDig bits + } else { + a <<= (dstMantDig - sd); + // a is now rounded to dstMantDig bits + } + const int dstBits = sizeof(dst_t) * CHAR_BIT; + const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1); + const int dstExpBits = dstBits - dstSigBits - 1; + const int dstExpBias = (1 << (dstExpBits - 1)) - 1; + const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1; + // Combine sign, exponent, and mantissa. + const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | + ((dst_rep_t)(e + dstExpBias) << dstSigBits) | + ((dst_rep_t)(a) & dstSignificandMask); + return dstFromRep(result); +} diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index e94d315..48862f3 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -107,8 +107,8 @@ typedef union { static __inline ti_int make_ti(di_int h, di_int l) { twords r; - r.s.high = h; - r.s.low = l; + r.s.high = (du_int)h; + r.s.low = (du_int)l; return r.all; } @@ -139,7 +139,6 @@ typedef union { udwords u; double f; } double_bits; -#endif typedef struct { #if _YUGA_LITTLE_ENDIAN @@ -165,16 +164,83 @@ typedef struct { #define HAS_80_BIT_LONG_DOUBLE 0 #endif -#if CRT_HAS_FLOATING_POINT +#if HAS_80_BIT_LONG_DOUBLE +typedef long double xf_float; +typedef union { + uqwords u; + xf_float f; +} xf_bits; +#endif + +#ifdef __powerpc64__ +// From https://gcc.gnu.org/wiki/Ieee128PowerPC: +// PowerPC64 uses the following suffixes: +// IFmode: IBM extended double +// KFmode: IEEE 128-bit floating point +// TFmode: Matches the default for long double. With -mabi=ieeelongdouble, +// it is IEEE 128-bit, with -mabi=ibmlongdouble IBM extended double +// Since compiler-rt only implements the tf set of libcalls, we use long double +// for the tf_float typedef. +typedef long double tf_float; +#define CRT_LDBL_128BIT +#define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 && !defined(__LONG_DOUBLE_IBM128__) +#define CRT_HAS_IEEE_TF +#define CRT_LDBL_IEEE_F128 +#endif +#define TF_C(x) x##L +#elif __LDBL_MANT_DIG__ == 113 || \ + (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28) +// Use long double instead of __float128 if it matches the IEEE 128-bit format +// or the IBM hexadecimal format. +#define CRT_LDBL_128BIT +#define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 +#define CRT_HAS_IEEE_TF +#define CRT_LDBL_IEEE_F128 +#endif +typedef long double tf_float; +#define TF_C(x) x##L +#elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) +#define CRT_HAS___FLOAT128_KEYWORD +#define CRT_HAS_F128 +// NB: we assume the __float128 type uses IEEE representation. +#define CRT_HAS_IEEE_TF +typedef __float128 tf_float; +#define TF_C(x) x##Q +#endif + +#ifdef CRT_HAS_F128 typedef union { uqwords u; - long double f; -} long_double_bits; + tf_float f; +} tf_bits; +#endif + +// __(u)int128_t is currently needed to compile the *tf builtins as we would +// otherwise need to manually expand the bit manipulation on two 64-bit values.
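These type selections feed the `CRT_HAS_TF_MODE` guard defined just below, which the `*tf` builtins later in this patch test instead of the old `CRT_LDBL_128BIT`. A rough compile-time probe of the selection logic (assumes a hosted build with `fp_lib.h` on the include path; illustrative only):

```c
#define QUAD_PRECISION
#include "fp_lib.h"
#include <stdio.h>

int main(void) {
#if defined(CRT_LDBL_128BIT)
  puts("tf_float is long double (128-bit)");
#elif defined(CRT_HAS___FLOAT128_KEYWORD)
  puts("tf_float is __float128");
#else
  puts("no 128-bit float type; *tf builtins are compiled out");
#endif
  return 0;
}
```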
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) +#define CRT_HAS_TF_MODE +#endif #if __STDC_VERSION__ >= 199901L typedef float _Complex Fcomplex; typedef double _Complex Dcomplex; typedef long double _Complex Lcomplex; +#if defined(CRT_LDBL_128BIT) +typedef Lcomplex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#elif defined(CRT_HAS___FLOAT128_KEYWORD) +#if defined(__clang_major__) && __clang_major__ > 10 +// Clang prior to 11 did not support __float128 _Complex. +typedef __float128 _Complex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#elif defined(__GNUC__) && __GNUC__ >= 7 +// GCC does not allow __float128 _Complex, but accepts _Float128 _Complex. +typedef _Float128 _Complex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#endif +#endif #define COMPLEX_REAL(x) __real__(x) #define COMPLEX_IMAGINARY(x) __imag__(x) @@ -194,5 +260,17 @@ typedef struct { #define COMPLEX_REAL(x) (x).real #define COMPLEX_IMAGINARY(x) (x).imaginary #endif + +#ifdef CRT_HAS_NATIVE_COMPLEX_F128 +#define COMPLEXTF_REAL(x) __real__(x) +#define COMPLEXTF_IMAGINARY(x) __imag__(x) +#elif defined(CRT_HAS_F128) +typedef struct { + tf_float real, imaginary; +} Qcomplex; +#define COMPLEXTF_REAL(x) (x).real +#define COMPLEXTF_IMAGINARY(x) (x).imaginary #endif + +#endif // CRT_HAS_FLOATING_POINT #endif // INT_TYPES_H diff --git a/compiler-rt/lib/builtins/lshrti3.c b/compiler-rt/lib/builtins/lshrti3.c index d00a220..5dc8a0a 100644 --- a/compiler-rt/lib/builtins/lshrti3.c +++ b/compiler-rt/lib/builtins/lshrti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __lshrti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __lshrti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); utwords input; utwords result; diff --git a/compiler-rt/lib/builtins/mulodi4.c b/compiler-rt/lib/builtins/mulodi4.c index 7209676..6ecf926 100644 --- a/compiler-rt/lib/builtins/mulodi4.c +++ b/compiler-rt/lib/builtins/mulodi4.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t di_int +#define fixuint_t du_int #include "int_mulo_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulosi4.c b/compiler-rt/lib/builtins/mulosi4.c index 4e03c24..3fd18a1 100644 --- a/compiler-rt/lib/builtins/mulosi4.c +++ b/compiler-rt/lib/builtins/mulosi4.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t si_int +#define fixuint_t su_int #include "int_mulo_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/muloti4.c b/compiler-rt/lib/builtins/muloti4.c index 9a7aa85..9aab6fc 100644 --- a/compiler-rt/lib/builtins/muloti4.c +++ b/compiler-rt/lib/builtins/muloti4.c @@ -19,6 +19,7 @@ // Effects: sets *overflow to 1 if a * b overflows #define fixint_t ti_int +#define fixuint_t tu_int #include "int_mulo_impl.inc" COMPILER_RT_ABI ti_int __muloti4(ti_int a, ti_int b, int *overflow) { diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c index bb7f6aa..a89832f 100644 --- a/compiler-rt/lib/builtins/multc3.c +++ b/compiler-rt/lib/builtins/multc3.c @@ -10,56 +10,61 @@ // //===----------------------------------------------------------------------===// +#define QUAD_PRECISION +#include "fp_lib.h" #include "int_lib.h" #include "int_math.h" +#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) + // Returns: the product of a + ib and c + id -COMPILER_RT_ABI long double _Complex __multc3(long double a, long double b, - long double c, 
long double d) { - long double ac = a * c; - long double bd = b * d; - long double ad = a * d; - long double bc = b * c; - long double _Complex z; - __real__ z = ac - bd; - __imag__ z = ad + bc; - if (crt_isnan(__real__ z) && crt_isnan(__imag__ z)) { +COMPILER_RT_ABI Qcomplex __multc3(fp_t a, fp_t b, fp_t c, fp_t d) { + fp_t ac = a * c; + fp_t bd = b * d; + fp_t ad = a * d; + fp_t bc = b * c; + Qcomplex z; + COMPLEXTF_REAL(z) = ac - bd; + COMPLEXTF_IMAGINARY(z) = ad + bc; + if (crt_isnan(COMPLEXTF_REAL(z)) && crt_isnan(COMPLEXTF_IMAGINARY(z))) { int recalc = 0; if (crt_isinf(a) || crt_isinf(b)) { - a = crt_copysignl(crt_isinf(a) ? 1 : 0, a); - b = crt_copysignl(crt_isinf(b) ? 1 : 0, b); + a = crt_copysigntf(crt_isinf(a) ? 1 : 0, a); + b = crt_copysigntf(crt_isinf(b) ? 1 : 0, b); if (crt_isnan(c)) - c = crt_copysignl(0, c); + c = crt_copysigntf(0, c); if (crt_isnan(d)) - d = crt_copysignl(0, d); + d = crt_copysigntf(0, d); recalc = 1; } if (crt_isinf(c) || crt_isinf(d)) { - c = crt_copysignl(crt_isinf(c) ? 1 : 0, c); - d = crt_copysignl(crt_isinf(d) ? 1 : 0, d); + c = crt_copysigntf(crt_isinf(c) ? 1 : 0, c); + d = crt_copysigntf(crt_isinf(d) ? 1 : 0, d); if (crt_isnan(a)) - a = crt_copysignl(0, a); + a = crt_copysigntf(0, a); if (crt_isnan(b)) - b = crt_copysignl(0, b); + b = crt_copysigntf(0, b); recalc = 1; } if (!recalc && (crt_isinf(ac) || crt_isinf(bd) || crt_isinf(ad) || crt_isinf(bc))) { if (crt_isnan(a)) - a = crt_copysignl(0, a); + a = crt_copysigntf(0, a); if (crt_isnan(b)) - b = crt_copysignl(0, b); + b = crt_copysigntf(0, b); if (crt_isnan(c)) - c = crt_copysignl(0, c); + c = crt_copysigntf(0, c); if (crt_isnan(d)) - d = crt_copysignl(0, d); + d = crt_copysigntf(0, d); recalc = 1; } if (recalc) { - __real__ z = CRT_INFINITY * (a * c - b * d); - __imag__ z = CRT_INFINITY * (a * d + b * c); + COMPLEXTF_REAL(z) = CRT_INFINITY * (a * c - b * d); + COMPLEXTF_IMAGINARY(z) = CRT_INFINITY * (a * d + b * c); } } return z; } + +#endif diff --git a/compiler-rt/lib/builtins/multf3.c b/compiler-rt/lib/builtins/multf3.c index 0626fb8..8fd7368 100644 --- a/compiler-rt/lib/builtins/multf3.c +++ b/compiler-rt/lib/builtins/multf3.c @@ -14,7 +14,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_mul_impl.inc" COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) { return __mulXf3__(a, b); } diff --git a/compiler-rt/lib/builtins/mulvdi3.c b/compiler-rt/lib/builtins/mulvdi3.c index 1d672c6..d787d29 100644 --- a/compiler-rt/lib/builtins/mulvdi3.c +++ b/compiler-rt/lib/builtins/mulvdi3.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t di_int +#define fixuint_t du_int #include "int_mulv_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulvsi3.c b/compiler-rt/lib/builtins/mulvsi3.c index 00b2e50..2571881 100644 --- a/compiler-rt/lib/builtins/mulvsi3.c +++ b/compiler-rt/lib/builtins/mulvsi3.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t si_int +#define fixuint_t su_int #include "int_mulv_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulvti3.c b/compiler-rt/lib/builtins/mulvti3.c index ba35514..fad9b2a 100644 --- a/compiler-rt/lib/builtins/mulvti3.c +++ b/compiler-rt/lib/builtins/mulvti3.c @@ -19,6 +19,7 @@ // Effects: aborts if a * b overflows #define fixint_t ti_int +#define fixuint_t tu_int #include "int_mulv_impl.inc" COMPILER_RT_ABI ti_int 
__mulvti3(ti_int a, ti_int b) { return __mulvXi3(a, b); } diff --git a/compiler-rt/lib/builtins/mulxc3.c b/compiler-rt/lib/builtins/mulxc3.c index 2f7f14c..66b5b58 100644 --- a/compiler-rt/lib/builtins/mulxc3.c +++ b/compiler-rt/lib/builtins/mulxc3.c @@ -17,12 +17,12 @@ // Returns: the product of a + ib and c + id -COMPILER_RT_ABI Lcomplex __mulxc3(long double __a, long double __b, - long double __c, long double __d) { - long double __ac = __a * __c; - long double __bd = __b * __d; - long double __ad = __a * __d; - long double __bc = __b * __c; +COMPILER_RT_ABI Lcomplex __mulxc3(xf_float __a, xf_float __b, xf_float __c, + xf_float __d) { + xf_float __ac = __a * __c; + xf_float __bd = __b * __d; + xf_float __ad = __a * __d; + xf_float __bc = __b * __c; Lcomplex z; COMPLEX_REAL(z) = __ac - __bd; COMPLEX_IMAGINARY(z) = __ad + __bc; diff --git a/compiler-rt/lib/builtins/negdi2.c b/compiler-rt/lib/builtins/negdi2.c index 5a525d4..714ac8c 100644 --- a/compiler-rt/lib/builtins/negdi2.c +++ b/compiler-rt/lib/builtins/negdi2.c @@ -17,5 +17,5 @@ COMPILER_RT_ABI di_int __negdi2(di_int a) { // Note: this routine is here for API compatibility; any sane compiler // should expand it inline. - return -a; + return -(du_int)a; } diff --git a/compiler-rt/lib/builtins/negti2.c b/compiler-rt/lib/builtins/negti2.c index d52ba4e..ab6e09d 100644 --- a/compiler-rt/lib/builtins/negti2.c +++ b/compiler-rt/lib/builtins/negti2.c @@ -19,7 +19,7 @@ COMPILER_RT_ABI ti_int __negti2(ti_int a) { // Note: this routine is here for API compatibility; any sane compiler // should expand it inline. - return -a; + return -(tu_int)a; } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/negvdi2.c b/compiler-rt/lib/builtins/negvdi2.c index 5c52b3e..8c1cf2f 100644 --- a/compiler-rt/lib/builtins/negvdi2.c +++ b/compiler-rt/lib/builtins/negvdi2.c @@ -17,7 +17,8 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI di_int __negvdi2(di_int a) { - const di_int MIN = (di_int)1 << ((int)(sizeof(di_int) * CHAR_BIT) - 1); + const di_int MIN = + (di_int)((du_int)1 << ((int)(sizeof(di_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/negvsi2.c b/compiler-rt/lib/builtins/negvsi2.c index cccdee6..70f214f 100644 --- a/compiler-rt/lib/builtins/negvsi2.c +++ b/compiler-rt/lib/builtins/negvsi2.c @@ -17,7 +17,8 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI si_int __negvsi2(si_int a) { - const si_int MIN = (si_int)1 << ((int)(sizeof(si_int) * CHAR_BIT) - 1); + const si_int MIN = + (si_int)((su_int)1 << ((int)(sizeof(si_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/negvti2.c b/compiler-rt/lib/builtins/negvti2.c index 8f92e10..fc14840 100644 --- a/compiler-rt/lib/builtins/negvti2.c +++ b/compiler-rt/lib/builtins/negvti2.c @@ -19,7 +19,7 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI ti_int __negvti2(ti_int a) { - const ti_int MIN = (ti_int)1 << ((int)(sizeof(ti_int) * CHAR_BIT) - 1); + const ti_int MIN = (ti_int)((tu_int)1 << ((int)(sizeof(ti_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c index ebfb2df..b10f23a 100644 --- a/compiler-rt/lib/builtins/os_version_check.c +++ b/compiler-rt/lib/builtins/os_version_check.c @@ -14,6 +14,7 @@ #ifdef __APPLE__ #include <TargetConditionals.h> +#include <assert.h> #include <dispatch/dispatch.h> #include <dlfcn.h> #include <stdint.h> @@ -86,6 +87,10 @@ typedef Boolean (*CFStringGetCStringFuncTy)(CFStringRef, char *, CFIndex,
CFStringEncoding); typedef void (*CFReleaseFuncTy)(CFTypeRef); +extern __attribute__((weak_import)) +bool _availability_version_check(uint32_t count, + dyld_build_version_t versions[]); + static void _initializeAvailabilityCheck(bool LoadPlist) { if (AvailabilityVersionCheck && !LoadPlist) { // New API is supported and we're not being asked to load the plist, @@ -94,8 +99,8 @@ static void _initializeAvailabilityCheck(bool LoadPlist) { } // Use the new API if it's available. - AvailabilityVersionCheck = (AvailabilityVersionCheckFuncTy)dlsym( - RTLD_DEFAULT, "_availability_version_check"); + if (_availability_version_check) + AvailabilityVersionCheck = &_availability_version_check; if (AvailabilityVersionCheck && !LoadPlist) { // New API is supported and we're not being asked to load the plist, @@ -266,6 +271,8 @@ static inline uint32_t ConstructVersion(uint32_t Major, uint32_t Minor, return ((Major & 0xffff) << 16) | ((Minor & 0xff) << 8) | (Subminor & 0xff); } +#define PLATFORM_MACOS 1 + int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor) { dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck); @@ -278,6 +285,29 @@ int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major, return AvailabilityVersionCheck(1, Versions); } +#if TARGET_OS_OSX + +int32_t __isPlatformOrVariantPlatformVersionAtLeast( + uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor, + uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2) { + dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck); + + if (!AvailabilityVersionCheck) { + // Handle case of back-deployment for older macOS. + if (Platform == PLATFORM_MACOS) { + return __isOSVersionAtLeast(Major, Minor, Subminor); + } + assert(Platform2 == PLATFORM_MACOS && "unexpected platform"); + return __isOSVersionAtLeast(Major2, Minor2, Subminor2); + } + dyld_build_version_t Versions[] = { + {Platform, ConstructVersion(Major, Minor, Subminor)}, + {Platform2, ConstructVersion(Major2, Minor2, Subminor2)}}; + return AvailabilityVersionCheck(2, Versions); +} + +#endif + #elif __ANDROID__ #include <pthread.h> @@ -312,8 +342,8 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, readSystemProperties); - return SdkVersion >= Major || - (IsPreRelease && Major == __ANDROID_API_FUTURE__); + // Allow all on pre-release. Note that we still rely on compile-time checks.
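The simplified `return` that follows drops the `__ANDROID_API_FUTURE__` comparison: on a pre-release system, any requested major version now passes. For context, `SdkVersion` and `IsPreRelease` are filled in from system properties, roughly as sketched here (modelled on what `readSystemProperties()` in this file does; details may differ):

```c
#include <stdlib.h>
#include <string.h>
#include <sys/system_properties.h>

static int SdkVersion;
static int IsPreRelease;

static void readSystemProperties(void) {
  char buf[PROP_VALUE_MAX];
  if (__system_property_get("ro.build.version.sdk", buf) > 0)
    SdkVersion = atoi(buf);
  // Pre-release builds report a codename instead of "REL".
  if (__system_property_get("ro.build.version.codename", buf) > 0)
    IsPreRelease = strcmp(buf, "REL") != 0;
}
```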
+ return SdkVersion >= Major || IsPreRelease; } #else diff --git a/compiler-rt/lib/builtins/powitf2.c b/compiler-rt/lib/builtins/powitf2.c index 8e639a0..e02db40 100644 --- a/compiler-rt/lib/builtins/powitf2.c +++ b/compiler-rt/lib/builtins/powitf2.c @@ -13,13 +13,13 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) // Returns: a ^ b -COMPILER_RT_ABI long double __powitf2(long double a, int b) { +COMPILER_RT_ABI fp_t __powitf2(fp_t a, int b) { const int recip = b < 0; - long double r = 1; + fp_t r = 1; while (1) { if (b & 1) r *= a; diff --git a/compiler-rt/lib/builtins/powixf2.c b/compiler-rt/lib/builtins/powixf2.c index 3edfe9f..ab8c694 100644 --- a/compiler-rt/lib/builtins/powixf2.c +++ b/compiler-rt/lib/builtins/powixf2.c @@ -16,9 +16,9 @@ // Returns: a ^ b -COMPILER_RT_ABI long double __powixf2(long double a, int b) { +COMPILER_RT_ABI xf_float __powixf2(xf_float a, int b) { const int recip = b < 0; - long double r = 1; + xf_float r = 1; while (1) { if (b & 1) r *= a; diff --git a/compiler-rt/lib/builtins/riscv/fp_mode.c b/compiler-rt/lib/builtins/riscv/fp_mode.c index c542c34..1a5a3de 100644 --- a/compiler-rt/lib/builtins/riscv/fp_mode.c +++ b/compiler-rt/lib/builtins/riscv/fp_mode.c @@ -15,7 +15,7 @@ #define RISCV_INEXACT 0x1 CRT_FE_ROUND_MODE __fe_getround(void) { -#if defined(__riscv_f) +#if defined(__riscv_f) || defined(__riscv_zfinx) int frm; __asm__ __volatile__("frrm %0" : "=r" (frm)); switch (frm) { @@ -35,7 +35,7 @@ CRT_FE_ROUND_MODE __fe_getround(void) { } int __fe_raise_inexact(void) { -#if defined(__riscv_f) +#if defined(__riscv_f) || defined(__riscv_zfinx) __asm__ __volatile__("csrsi fflags, %0" :: "i" (RISCV_INEXACT)); #endif return 0; diff --git a/compiler-rt/lib/builtins/riscv/restore.S b/compiler-rt/lib/builtins/riscv/restore.S index 73f64a9..d87dfc1 100644 --- a/compiler-rt/lib/builtins/riscv/restore.S +++ b/compiler-rt/lib/builtins/riscv/restore.S @@ -22,6 +22,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_abi_rve + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -86,8 +88,29 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + lw s1, 0(sp) + lw s0, 4(sp) + lw ra, 8(sp) + addi sp, sp, 12 + ret + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_abi_rve + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -161,6 +184,25 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + ld s1, 0(sp) + ld s0, 8(sp) + ld ra, 16(sp) + addi sp, sp, 24 + ret + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/riscv/save.S b/compiler-rt/lib/builtins/riscv/save.S index 85501ae..6324e05 100644 --- a/compiler-rt/lib/builtins/riscv/save.S +++ b/compiler-rt/lib/builtins/riscv/save.S @@ -18,6 +18,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_abi_rve + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -92,8 +94,29 @@ __riscv_save_0: sw ra, 12(sp) jr t0 +#else + 
+ .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -12 + sw s1, 0(sp) + sw s0, 4(sp) + sw ra, 8(sp) + jr t0 + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_abi_rve + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -181,6 +204,25 @@ __riscv_save_0: sd ra, 8(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -24 + sd s1, 0(sp) + sd s0, 8(sp) + sd ra, 16(sp) + jr t0 + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/subtf3.c b/compiler-rt/lib/builtins/subtf3.c index 3364c28..e1b1022 100644 --- a/compiler-rt/lib/builtins/subtf3.c +++ b/compiler-rt/lib/builtins/subtf3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __addtf3(fp_t a, fp_t b); // Subtraction; flip the sign bit of b and add. diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c index 844eb27..830e25e 100644 --- a/compiler-rt/lib/builtins/trampoline_setup.c +++ b/compiler-rt/lib/builtins/trampoline_setup.c @@ -41,3 +41,45 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, __clear_cache(trampOnStack, &trampOnStack[10]); } #endif // __powerpc__ && !defined(__powerpc64__) + +// The AArch64 compiler generates calls to __trampoline_setup() when creating +// trampoline functions on the stack for use with nested functions. +// This function creates a custom 36-byte trampoline function on the stack +// which loads x18 with a pointer to the outer function's locals +// and then jumps to the target nested function. +// Note: x18 is a reserved platform register on Windows and macOS. + +#if defined(__aarch64__) && defined(__ELF__) +COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, + int trampSizeAllocated, + const void *realFunc, void *localsPtr) { + // This should never happen, but if compiler did not allocate + // enough space on stack for the trampoline, abort. + if (trampSizeAllocated < 36) + compilerrt_abort(); + + // create trampoline + // Load realFunc into x17. mov/movk 16 bits at a time. + trampOnStack[0] = + 0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11; + trampOnStack[1] = + 0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11; + trampOnStack[2] = + 0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11; + trampOnStack[3] = + 0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11; + // Load localsPtr into x18 + trampOnStack[4] = + 0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12; + trampOnStack[5] = + 0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12; + trampOnStack[6] = + 0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12; + trampOnStack[7] = + 0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12; + trampOnStack[8] = 0xd61f0220; // br x17 + + // Clear instruction cache. 
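Before the final `__clear_cache` call below: the constants stored into `trampOnStack` are A64 `movz`/`movk`/`br` encodings targeting x17 (`0x11`) and x18 (`0x12`). Hypothetical helpers that make the encoding explicit (the patch itself writes the constants inline):

```c
#include <stdint.h>

// movz xd, #imm16               -> 0xd2800000 | imm16 << 5 | d
static uint32_t movz(unsigned d, uint16_t imm16) {
  return 0xd2800000u | ((uint32_t)imm16 << 5) | d;
}

// movk xd, #imm16, lsl #(hw*16) -> 0xf2800000 | hw << 21 | imm16 << 5 | d
// hw = 1/2/3 yields the 0xf2a0/0xf2c0/0xf2e0 prefixes seen above.
static uint32_t movk(unsigned d, uint16_t imm16, unsigned hw) {
  return 0xf2800000u | ((uint32_t)hw << 21) | ((uint32_t)imm16 << 5) | d;
}

// br x17 -> 0xd61f0000 | 17 << 5 == 0xd61f0220, the last word stored.
```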
+ __clear_cache(trampOnStack, &trampOnStack[9]); +} +#endif // defined(__aarch64__) && defined(__ELF__) diff --git a/compiler-rt/lib/builtins/trunctfdf2.c b/compiler-rt/lib/builtins/trunctfdf2.c index 6857ea5..a5bdded 100644 --- a/compiler-rt/lib/builtins/trunctfdf2.c +++ b/compiler-rt/lib/builtins/trunctfdf2.c @@ -9,11 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_QUAD #define DST_DOUBLE #include "fp_trunc_impl.inc" -COMPILER_RT_ABI double __trunctfdf2(long double a) { return __truncXfYf2__(a); } +COMPILER_RT_ABI dst_t __trunctfdf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfhf2.c b/compiler-rt/lib/builtins/trunctfhf2.c index e3a2309..3f031e0 100644 --- a/compiler-rt/lib/builtins/trunctfhf2.c +++ b/compiler-rt/lib/builtins/trunctfhf2.c @@ -10,14 +10,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) && \ - defined(COMPILER_RT_HAS_FLOAT16) +#if defined(CRT_HAS_TF_MODE) && defined(COMPILER_RT_HAS_FLOAT16) #define SRC_QUAD #define DST_HALF #include "fp_trunc_impl.inc" -COMPILER_RT_ABI _Float16 __trunctfhf2(long double a) { - return __truncXfYf2__(a); -} +COMPILER_RT_ABI dst_t __trunctfhf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfsf2.c b/compiler-rt/lib/builtins/trunctfsf2.c index 0261b1e..b65b5af 100644 --- a/compiler-rt/lib/builtins/trunctfsf2.c +++ b/compiler-rt/lib/builtins/trunctfsf2.c @@ -9,11 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_QUAD #define DST_SINGLE #include "fp_trunc_impl.inc" -COMPILER_RT_ABI float __trunctfsf2(long double a) { return __truncXfYf2__(a); } +COMPILER_RT_ABI dst_t __trunctfsf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfxf2.c b/compiler-rt/lib/builtins/trunctfxf2.c new file mode 100644 index 0000000..49bd32d --- /dev/null +++ b/compiler-rt/lib/builtins/trunctfxf2.c @@ -0,0 +1,23 @@ +//===-- lib/trunctfxf2.c - quad -> 80-bit extended conversion -----*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Assumption: long double is an IEEE 80-bit floating point type padded to 128 +// bits. + +#define QUAD_PRECISION +#include "fp_lib.h" + +#if defined(CRT_HAS_TF_MODE) && __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) + +#define SRC_QUAD +#define DST_80 +#include "fp_trunc_impl.inc" + +COMPILER_RT_ABI xf_float __trunctfxf2(tf_float a) { return __truncXfYf2__(a); } + +#endif diff --git a/compiler-rt/lib/builtins/x86_64/chkstk2.S b/compiler-rt/lib/builtins/x86_64/chkstk2.S deleted file mode 100644 index 33d10d5..0000000 --- a/compiler-rt/lib/builtins/x86_64/chkstk2.S +++ /dev/null @@ -1,43 +0,0 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "../assembly.h" - -#ifdef __x86_64__ - -// _chkstk (_alloca) routine - probe stack between %rsp and (%rsp-%rax) in 4k increments, -// then decrement %rsp by %rax.
Preserves all registers except %rsp and flags. -// This routine is windows specific -// http://msdn.microsoft.com/en-us/library/ms648426.aspx - -.text -.balign 4 -DEFINE_COMPILERRT_FUNCTION(__alloca) - mov %rcx,%rax // x64 _alloca is a normal function with parameter in rcx - // fallthrough -DEFINE_COMPILERRT_FUNCTION(___chkstk) - push %rcx - cmp $0x1000,%rax - lea 16(%rsp),%rcx // rsp before calling this routine -> rcx - jb 1f -2: - sub $0x1000,%rcx - test %rcx,(%rcx) - sub $0x1000,%rax - cmp $0x1000,%rax - ja 2b -1: - sub %rax,%rcx - test %rcx,(%rcx) - - lea 8(%rsp),%rax // load pointer to the return address into rax - mov %rcx,%rsp // install the new top of stack pointer into rsp - mov -8(%rax),%rcx // restore rcx - push (%rax) // push return address onto the stack - sub %rsp,%rax // restore the original value in rax - ret -END_COMPILERRT_FUNCTION(___chkstk) -END_COMPILERRT_FUNCTION(__alloca) - -#endif // __x86_64__ diff --git a/compiler-rt/lib/builtins/x86_64/floatdixf.c b/compiler-rt/lib/builtins/x86_64/floatdixf.c index cf8450c..54636e2 100644 --- a/compiler-rt/lib/builtins/x86_64/floatdixf.c +++ b/compiler-rt/lib/builtins/x86_64/floatdixf.c @@ -2,12 +2,12 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// long double __floatdixf(di_int a); +// xf_float __floatdixf(di_int a); #ifdef __x86_64__ #include "../int_lib.h" -long double __floatdixf(int64_t a) { return (long double)a; } +xf_float __floatdixf(int64_t a) { return (xf_float)a; } #endif // __i386__ diff --git a/compiler-rt/lib/builtins/x86_64/floatundixf.S b/compiler-rt/lib/builtins/x86_64/floatundixf.S index 9e3bced..cf7286f 100644 --- a/compiler-rt/lib/builtins/x86_64/floatundixf.S +++ b/compiler-rt/lib/builtins/x86_64/floatundixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatundixf(du_int a); +// xf_float __floatundixf(du_int a); #ifdef __x86_64__