diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2490c03..e78370a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,10 +8,15 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive - - name: Install llvm 16 - run: sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 && wget https://apt.llvm.org/llvm.sh && chmod +x llvm.sh && sudo ./llvm.sh 16 + - name: Install llvm 18 + run: | + sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 + rm llvm.sh - name: Build run: make diff --git a/Makefile b/Makefile index 8217c31..7fdd8e4 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,8 @@ - - -CC := clang-16 -LD := ld.lld-16 -OBJCOPY := llvm-objcopy-16 -AR := llvm-ar-16 -RANLIB := llvm-ranlib-16 +CC := clang-18 +LD := ld.lld-18 +OBJCOPY := llvm-objcopy-18 +AR := llvm-ar-18 +RANLIB := llvm-ranlib-18 UNAME := $(shell uname) ifeq ($(UNAME), Darwin) @@ -14,18 +12,19 @@ ifeq ($(UNAME), Darwin) AR := llvm-ar endif -CFLAGS := --target=riscv64 -march=rv64imc_zba_zbb_zbc_zbs -mabi=lp64 +CFLAGS := --target=riscv64 -march=rv64imc_zba_zbb_zbc_zbs -mabi=lp64 CFLAGS += -Os CFLAGS += -fdata-sections -ffunction-sections -fno-builtin -fvisibility=hidden -fomit-frame-pointer CFLAGS += -I compiler-rt/lib/builtins CFLAGS += -DVISIBILITY_HIDDEN -DCOMPILER_RT_HAS_FLOAT16 -RT_OBJ := build/fixunsdfdi.o \ +RT_OBJ := \ build/absvdi2.o \ build/absvsi2.o \ build/absvti2.o \ build/adddf3.o \ build/addsf3.o \ +build/addtf3.o \ build/addvdi3.o \ build/addvsi3.o \ build/addvti3.o \ @@ -36,6 +35,7 @@ build/ashrdi3.o \ build/ashrti3.o \ build/bswapdi2.o \ build/bswapsi2.o \ +build/clear_cache.o \ build/clzdi2.o \ build/clzsi2.o \ build/clzti2.o \ @@ -43,6 +43,9 @@ build/cmpdi2.o \ build/cmpti2.o \ build/comparedf2.o \ build/comparesf2.o \ +build/comparetf2.o \ +build/crtbegin.o \ +build/crtend.o \ build/ctzdi2.o \ build/ctzsi2.o \ build/ctzti2.o \ @@ -55,9 +58,16 @@ build/divmodti4.o \ build/divsc3.o \ build/divsf3.o \ build/divsi3.o \ +build/divtc3.o \ +build/divtf3.o \ build/divti3.o \ -build/extendsfdf2.o \ +build/extendbfsf2.o \ +build/extenddftf2.o \ build/extendhfsf2.o \ +build/extendhftf2.o \ +build/extendsfdf2.o \ +build/extendsftf2.o \ +build/extendxftf2.o \ build/ffsdi2.o \ build/ffssi2.o \ build/ffsti2.o \ @@ -67,25 +77,38 @@ build/fixdfti.o \ build/fixsfdi.o \ build/fixsfsi.o \ build/fixsfti.o \ +build/fixtfdi.o \ +build/fixtfsi.o \ +build/fixtfti.o \ build/fixunsdfdi.o \ build/fixunsdfsi.o \ build/fixunsdfti.o \ build/fixunssfdi.o \ build/fixunssfsi.o \ build/fixunssfti.o \ +build/fixunstfdi.o \ +build/fixunstfsi.o \ +build/fixunstfti.o \ build/floatdidf.o \ build/floatdisf.o \ +build/floatditf.o \ build/floatsidf.o \ build/floatsisf.o \ +build/floatsitf.o \ build/floattidf.o \ build/floattisf.o \ +build/floattitf.o \ build/floatundidf.o \ build/floatundisf.o \ +build/floatunditf.o \ build/floatunsidf.o \ build/floatunsisf.o \ +build/floatunsitf.o \ build/floatuntidf.o \ build/floatuntisf.o \ +build/floatuntitf.o \ build/fp_mode.o \ +build/gcc_personality_v0.o \ build/int_util.o \ build/lshrdi3.o \ build/lshrti3.o \ @@ -100,6 +123,8 @@ build/mulosi4.o \ build/muloti4.o \ build/mulsc3.o \ build/mulsf3.o \ +build/multc3.o \ +build/multf3.o \ build/multi3.o \ build/mulvdi3.o \ build/mulvsi3.o \ @@ -120,15 +145,23 @@ build/popcountsi2.o \ build/popcountti2.o \ build/powidf2.o \ build/powisf2.o \ +build/powitf2.o \ 
build/subdf3.o \ build/subsf3.o \ +build/subtf3.o \ build/subvdi3.o \ build/subvsi3.o \ build/subvti3.o \ build/trampoline_setup.o \ +build/truncdfbf2.o \ build/truncdfhf2.o \ build/truncdfsf2.o \ +build/truncsfbf2.o \ build/truncsfhf2.o \ +build/trunctfdf2.o \ +build/trunctfhf2.o \ +build/trunctfsf2.o \ +build/trunctfxf2.o \ build/ucmpdi2.o \ build/ucmpti2.o \ build/udivdi3.o \ @@ -139,38 +172,11 @@ build/udivsi3.o \ build/udivti3.o \ build/umoddi3.o \ build/umodsi3.o \ -build/umodti3.o \ -build/addtf3.o \ -build/comparetf2.o \ -build/divtc3.o \ -build/divtf3.o \ -build/extenddftf2.o \ -build/extendhftf2.o \ -build/extendsftf2.o \ -build/fixtfdi.o \ -build/fixtfsi.o \ -build/fixtfti.o \ -build/fixunstfdi.o \ -build/fixunstfsi.o \ -build/fixunstfti.o \ -build/floatditf.o \ -build/floatsitf.o \ -build/floattitf.o \ -build/floatunditf.o \ -build/floatunsitf.o \ -build/floatuntitf.o \ -build/multc3.o \ -build/multf3.o \ -build/powitf2.o \ -build/subtf3.o \ -build/trunctfdf2.o \ -build/trunctfhf2.o \ -build/trunctfsf2.o - -RISCV_OBJ := build/fp_mode.o build/muldi3.S.o -# build/save.o \ -# build/restore.o\ +build/umodti3.o +RISCV_OBJ := \ + build/fp_mode.o \ + build/muldi3.S.o all: build/libcompiler-rt.a @@ -189,6 +195,6 @@ build/muldi3.S.o: compiler-rt/lib/builtins/riscv/muldi3.S @echo build $< @$(CC) $(CFLAGS) -c -o $@ $< -clean: +clean: rm -f build/*.o rm -f build/*.a diff --git a/README.md b/README.md index 6d6e481..89d5c1f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,8 @@ we possess the capability to construct it entirely from scratch. ## Build To build it, run `make`; the static library will be generated at `build/libcompiler-rt.a`. Then use the following Makefile configuration: -``` + +```text LDFLAGS += -L./build -lcompiler-rt ``` @@ -15,11 +16,7 @@ LDFLAGS += -L./build -lcompiler-rt This project comes **completely** from the llvm project: - Repo: `https://github.com/llvm/llvm-project` -- Branch: `release/16.x` -- Commit: `7cbf1a259` - -At the same time, we pulled the following commit from LLVM main branch to fix some bugs in clang-16: - -- +- Branch: `release/18.x` +- Commit: `3b5b5c1` -See more: https://github.com/llvm/llvm-project/blob/release/16.x/compiler-rt/lib/builtins/README.txt +See more: https://github.com/llvm/llvm-project/blob/release/18.x/compiler-rt/lib/builtins/README.txt diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 2fc7052..13adbd6 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -3,14 +3,7 @@ # architecture-specific code in various subdirectories. if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - cmake_minimum_required(VERSION 3.13.4) - if ("${CMAKE_VERSION}" VERSION_LESS "3.20.0") - message(WARNING - "Your CMake version is ${CMAKE_VERSION}. Starting with LLVM 17.0.0, the " - "minimum version of CMake required to build LLVM will become 3.20.0, and " - "using an older CMake will become an error. Please upgrade your CMake to " - "at least 3.20.0 now to avoid issues in the future!") - endif() + cmake_minimum_required(VERSION 3.20.0) set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) project(CompilerRTBuiltins C ASM) @@ -45,6 +38,13 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) include(UseLibtool) endif() include(AddCompilerRT) + + if(MINGW) + # Simplified version of what's set in cmake/config-ix.cmake; not including + # builtins, which are linked separately.
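+    # The list is consumed further down by the standalone libatomic build,
+    # which appends ${MINGW_LIBRARIES} to COMPILER_RT_LIBATOMIC_LINK_LIBS_<arch>.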
+ set(MINGW_LIBRARIES mingw32 moldname mingwex msvcrt advapi32 shell32 + user32 kernel32 mingw32 moldname mingwex msvcrt) + endif() endif() if (COMPILER_RT_STANDALONE_BUILD) @@ -58,12 +58,9 @@ if (COMPILER_RT_STANDALONE_BUILD) endif() include(builtin-config-ix) +include(CMakeDependentOption) include(CMakePushCheckState) -if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") - include(CompilerRTAIXUtils) -endif() - option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -193,12 +190,11 @@ set(GENERIC_SOURCES # We only build BF16 files when "__bf16" is available. set(BF16_SOURCES + extendbfsf2.c truncdfbf2.c truncsfbf2.c ) -# TODO: Several "tf" files (and divtc3.c, but not multc3.c) are in -# GENERIC_SOURCES instead of here. set(GENERIC_TF_SOURCES addtf3.c comparetf2.c @@ -232,7 +228,7 @@ option(COMPILER_RT_EXCLUDE_ATOMIC_BUILTIN "Skip the atomic builtin (these should normally be provided by a shared library)" On) -if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD) +if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD AND NOT COMPILER_RT_GPU_BUILD) set(GENERIC_SOURCES ${GENERIC_SOURCES} emutls.c @@ -241,6 +237,14 @@ if(NOT FUCHSIA AND NOT COMPILER_RT_BAREMETAL_BUILD) ) endif() +option(COMPILER_RT_LIBATOMIC_USE_PTHREAD + "Whether libatomic should use pthreads if available." + Off) + +if(COMPILER_RT_LIBATOMIC_USE_PTHREAD) + add_compile_definitions(_LIBATOMIC_USE_PTHREAD) +endif() + if(COMPILER_RT_HAS_ATOMIC_KEYWORD AND NOT COMPILER_RT_EXCLUDE_ATOMIC_BUILTIN) set(GENERIC_SOURCES ${GENERIC_SOURCES} @@ -276,7 +280,7 @@ endif() # These files are used on 32-bit and 64-bit x86. set(x86_ARCH_SOURCES - cpu_model.c + cpu_model/x86.c ) if (NOT MSVC) @@ -290,6 +294,7 @@ endif () # long double is not 80 bits on Android or MSVC. set(x86_80_BIT_SOURCES divxc3.c + extendxftf2.c fixxfdi.c fixxfti.c fixunsxfdi.c @@ -301,6 +306,7 @@ set(x86_80_BIT_SOURCES floatuntixf.c mulxc3.c powixf2.c + trunctfxf2.c ) if (NOT MSVC) @@ -310,17 +316,27 @@ if (NOT MSVC) ${x86_ARCH_SOURCES} x86_64/floatdidf.c x86_64/floatdisf.c - x86_64/floatundidf.S - x86_64/floatundisf.S ) + if (NOT WIN32) + set(x86_64_SOURCES + ${x86_64_SOURCES} + x86_64/floatundidf.S + x86_64/floatundisf.S + ) + endif() if (NOT ANDROID) set(x86_64_SOURCES ${x86_64_SOURCES} ${x86_80_BIT_SOURCES} x86_64/floatdixf.c - x86_64/floatundixf.S ) + if (NOT WIN32) + set(x86_64_SOURCES + ${x86_64_SOURCES} + x86_64/floatundixf.S + ) + endif() endif() # Darwin x86_64 Haswell @@ -330,7 +346,6 @@ if (NOT MSVC) set(x86_64_SOURCES ${x86_64_SOURCES} x86_64/chkstk.S - x86_64/chkstk2.S ) endif() @@ -364,7 +379,6 @@ if (NOT MSVC) set(i386_SOURCES ${i386_SOURCES} i386/chkstk.S - i386/chkstk2.S ) endif() else () # MSVC @@ -551,10 +565,29 @@ endif() set(aarch64_SOURCES ${GENERIC_TF_SOURCES} ${GENERIC_SOURCES} - cpu_model.c + cpu_model/aarch64.c aarch64/fp_mode.c ) +if (COMPILER_RT_HAS_AARCH64_SME) + if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c) + message(STATUS "AArch64 SME ABI routines enabled") + set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") + else() + if(COMPILER_RT_DISABLE_AARCH64_FMV) + message(WARNING "AArch64 SME ABI routines require function multiversioning support.") + endif() + if(NOT COMPILER_RT_HAS_FNO_BUILTIN_FLAG) + message(WARNING "AArch64 SME ABI 
routines require '-fno-builtin'") + endif() + if(NOT (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + message(WARNING "AArch64 SME ABI routines require sys/auxv.h or COMPILER_RT_BAREMETAL_BUILD flag") + endif() + message(STATUS "AArch64 SME ABI routines disabled") + endif() +endif() + # Generate outline atomics helpers from lse.S base set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir") file(MAKE_DIRECTORY "${OA_HELPERS_DIR}") @@ -567,11 +600,15 @@ endif() foreach(pat cas swp ldadd ldclr ldeor ldset) foreach(size 1 2 4 8 16) - foreach(model 1 2 3 4) + foreach(model 1 2 3 4 5) if(pat STREQUAL "cas" OR NOT size STREQUAL "16") + set(source_asm "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S") set(helper_asm "${OA_HELPERS_DIR}/outline_atomic_${pat}${size}_${model}.S") - list(APPEND lse_builtins "${helper_asm}") - list(APPEND arm64_lse_commands COMMAND ${CMAKE_COMMAND} -E ${COMPILER_RT_LINK_OR_COPY} "${CMAKE_CURRENT_SOURCE_DIR}/aarch64/lse.S" "${helper_asm}") + add_custom_command( + OUTPUT "${helper_asm}" + COMMAND ${CMAKE_COMMAND} -E ${COMPILER_RT_LINK_OR_COPY} "${source_asm}" "${helper_asm}" + DEPENDS "${source_asm}" + ) set_source_files_properties("${helper_asm}" PROPERTIES COMPILE_DEFINITIONS "L_${pat};SIZE=${size};MODEL=${model}" @@ -590,6 +627,8 @@ if (MINGW) ) endif() +set(amdgcn_SOURCES ${GENERIC_SOURCES}) + set(armv4t_SOURCES ${arm_min_SOURCES}) set(armv5te_SOURCES ${arm_min_SOURCES}) set(armv6_SOURCES ${arm_min_SOURCES}) @@ -605,6 +644,7 @@ set(arm64_32_SOURCES ${aarch64_SOURCES}) set(armv6m_SOURCES ${thumb1_SOURCES}) set(armv7m_SOURCES ${arm_SOURCES}) set(armv7em_SOURCES ${arm_SOURCES}) +set(armv8m.base_SOURCES ${thumb1_SOURCES}) set(armv8m.main_SOURCES ${arm_SOURCES}) set(armv8.1m.main_SOURCES ${arm_SOURCES}) @@ -668,6 +708,8 @@ set(mips64_SOURCES ${GENERIC_TF_SOURCES} set(mips64el_SOURCES ${GENERIC_TF_SOURCES} ${mips_SOURCES}) +set(nvptx64_SOURCES ${GENERIC_SOURCES}) + set(powerpc_SOURCES ${GENERIC_SOURCES}) set(powerpcspe_SOURCES ${GENERIC_SOURCES}) @@ -686,7 +728,7 @@ set(powerpc64_SOURCES ${GENERIC_SOURCES} ) # These routines require __int128, which isn't supported on AIX. -if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") +if (NOT OS_NAME MATCHES "AIX") set(powerpc64_SOURCES ppc/floattitf.c ppc/fixtfti.c @@ -697,6 +739,7 @@ endif() set(powerpc64le_SOURCES ${powerpc64_SOURCES}) set(riscv_SOURCES + cpu_model/riscv.c riscv/fp_mode.c riscv/save.S riscv/restore.S @@ -731,7 +774,11 @@ set(ve_SOURCES ${GENERIC_SOURCES}) add_custom_target(builtins) -set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT Misc") +set_target_properties(builtins PROPERTIES FOLDER "Compiler-RT/Metatargets") + +option(COMPILER_RT_ENABLE_SOFTWARE_INT128 + "Enable the int128 builtin routines for all targets." + OFF) if (APPLE) add_subdirectory(Darwin-excludes) @@ -746,6 +793,13 @@ else () endif() append_list_if(COMPILER_RT_HAS_STD_C11_FLAG -std=c11 BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_WBUILTIN_DECLARATION_MISMATCH_FLAG -Werror=builtin-declaration-mismatch BUILTIN_CFLAGS) + + # Don't embed directives for picking any specific CRT + if (MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "") + append_list_if(COMPILER_RT_HAS_ZL_FLAG /Zl BUILTIN_CFLAGS) + endif() # These flags would normally be added to CMAKE_C_FLAGS by the llvm # cmake step. Add them manually if this is a standalone build. @@ -762,6 +816,21 @@ else () endif() endif() + # Directly targeting the GPU requires a few extra flags. 
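+  # -ffreestanding and -nogpulib keep the hosted environment and the vendor
+  # device libraries out of the build, and -fconvergent-functions is needed
+  # because builtins may be called from convergent GPU code.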
+ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx") + append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_FLTO_FLAG -flto BUILTIN_CFLAGS) + append_list_if(COMPILER_RT_HAS_FCONVERGENT_FUNCTIONS_FLAG + -fconvergent-functions BUILTIN_CFLAGS) + + # AMDGPU targets want to use a generic ABI. + if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn") + append_list_if(COMPILER_RT_HAS_CODE_OBJECT_VERSION_FLAG + "SHELL:-Xclang -mcode-object-version=none" BUILTIN_CFLAGS) + endif() + endif() + set(BUILTIN_DEFS "") if(COMPILER_RT_BUILTINS_HIDE_SYMBOLS) @@ -820,20 +889,10 @@ else () # For RISCV32, we must force enable int128 for compiling long # double routines. - if("${arch}" STREQUAL "riscv32") + if(COMPILER_RT_ENABLE_SOFTWARE_INT128 OR "${arch}" STREQUAL "riscv32") list(APPEND BUILTIN_CFLAGS_${arch} -fforce-enable-int128) endif() - if(arch STREQUAL "aarch64") - add_custom_target( - lse_builtin_symlinks - BYPRODUCTS ${lse_builtins} - ${arm64_lse_commands} - ) - - set(deps_aarch64 lse_builtin_symlinks) - endif() - add_compiler_rt_runtime(clang_rt.builtins STATIC ARCHS ${arch} @@ -847,41 +906,44 @@ else () endforeach () endif () +add_dependencies(compiler-rt builtins) + option(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC "Build standalone shared atomic library." OFF) if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) add_custom_target(builtins-standalone-atomic) - set(BUILTIN_DEPS "") set(BUILTIN_TYPE SHARED) - if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if(OS_NAME MATCHES "AIX") + include(CompilerRTAIXUtils) if(NOT COMPILER_RT_LIBATOMIC_LINK_FLAGS) get_aix_libatomic_default_link_flags(COMPILER_RT_LIBATOMIC_LINK_FLAGS "${CMAKE_CURRENT_SOURCE_DIR}/ppc/atomic.exp") endif() - # The compiler needs builtins to link any other binaries, so let - # clang_rt.atomic be built after builtins. - set(BUILTIN_DEPS builtins) # For different versions of cmake, SHARED behaves differently. For some # versions, we might need MODULE rather than SHARED. get_aix_libatomic_type(BUILTIN_TYPE) + else() + list(APPEND COMPILER_RT_LIBATOMIC_LINK_FLAGS -nodefaultlibs) endif() foreach (arch ${BUILTIN_SUPPORTED_ARCH}) if(CAN_TARGET_${arch}) + list(APPEND COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch} clang_rt.builtins-${arch}) + append_list_if(MINGW "${MINGW_LIBRARIES}" COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch}) add_compiler_rt_runtime(clang_rt.atomic ${BUILTIN_TYPE} ARCHS ${arch} SOURCES atomic.c LINK_FLAGS ${COMPILER_RT_LIBATOMIC_LINK_FLAGS} - DEPS ${BUILTIN_DEPS} + LINK_LIBS ${COMPILER_RT_LIBATOMIC_LINK_LIBS_${arch}} PARENT_TARGET builtins-standalone-atomic) endif() endforeach() # FIXME: On AIX, we have to archive built shared libraries into a static # archive, i.e., libatomic.a. Once cmake adds support of such usage for AIX, # this ad-hoc part can be removed. 
- if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + if(OS_NAME MATCHES "AIX") archive_aix_libatomic(clang_rt.atomic libatomic ARCHS ${BUILTIN_SUPPORTED_ARCH} PARENT_TARGET builtins-standalone-atomic) @@ -889,4 +951,40 @@ if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) add_dependencies(compiler-rt builtins-standalone-atomic) endif() -add_dependencies(compiler-rt builtins) +cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COMPILER_RT_HAS_CRT" OFF) + +if (COMPILER_RT_BUILD_CRT) + add_compiler_rt_component(crt) + + option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON) + + include(CheckSectionExists) + check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY + SOURCE "volatile int x;\n__attribute__((constructor)) void f(void) {x = 0;}\nint main(void) { return 0; }\n") + + append_list_if(COMPILER_RT_HAS_STD_C11_FLAG -std=c11 CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_INITFINI_ARRAY -DCRT_HAS_INITFINI_ARRAY CRT_CFLAGS) + append_list_if(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY -DEH_USE_FRAME_REGISTRY CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_FPIC_FLAG -fPIC CRT_CFLAGS) + append_list_if(COMPILER_RT_HAS_WNO_PEDANTIC -Wno-pedantic CRT_CFLAGS) + if (COMPILER_RT_HAS_FCF_PROTECTION_FLAG) + append_list_if(COMPILER_RT_ENABLE_CET -fcf-protection=full CRT_CFLAGS) + endif() + + foreach(arch ${BUILTIN_SUPPORTED_ARCH}) + add_compiler_rt_runtime(clang_rt.crtbegin + OBJECT + ARCHS ${arch} + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/crtbegin.c + CFLAGS ${CRT_CFLAGS} + PARENT_TARGET crt) + add_compiler_rt_runtime(clang_rt.crtend + OBJECT + ARCHS ${arch} + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/crtend.c + CFLAGS ${CRT_CFLAGS} + PARENT_TARGET crt) + endforeach() + + add_dependencies(compiler-rt crt) +endif() diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index 53d656d..19f26c9 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -35,13 +35,13 @@ typedef uint64_t du_int; // Integral bit manipulation -di_int __ashldi3(di_int a, si_int b); // a << b -ti_int __ashlti3(ti_int a, si_int b); // a << b +di_int __ashldi3(di_int a, int b); // a << b +ti_int __ashlti3(ti_int a, int b); // a << b -di_int __ashrdi3(di_int a, si_int b); // a >> b arithmetic (sign fill) -ti_int __ashrti3(ti_int a, si_int b); // a >> b arithmetic (sign fill) -di_int __lshrdi3(di_int a, si_int b); // a >> b logical (zero fill) -ti_int __lshrti3(ti_int a, si_int b); // a >> b logical (zero fill) +di_int __ashrdi3(di_int a, int b); // a >> b arithmetic (sign fill) +ti_int __ashrti3(ti_int a, int b); // a >> b arithmetic (sign fill) +di_int __lshrdi3(di_int a, int b); // a >> b logical (zero fill) +ti_int __lshrti3(ti_int a, int b); // a >> b logical (zero fill) int __clzsi2(si_int a); // count leading zeros int __clzdi2(di_int a); // count leading zeros @@ -137,49 +137,54 @@ si_int __ucmpti2(tu_int a, tu_int b); di_int __fixsfdi( float a); di_int __fixdfdi( double a); di_int __fixxfdi(long double a); +di_int __fixtfdi( tf_float a); ti_int __fixsfti( float a); ti_int __fixdfti( double a); ti_int __fixxfti(long double a); -uint64_t __fixtfdi(long double input); // ppc only, doesn't match documentation +ti_int __fixtfti( tf_float a); su_int __fixunssfsi( float a); su_int __fixunsdfsi( double a); su_int __fixunsxfsi(long double a); +su_int __fixunstfsi( tf_float a); du_int __fixunssfdi( float a); du_int __fixunsdfdi( double a); du_int __fixunsxfdi(long double a); +du_int __fixunstfdi( tf_float a); tu_int __fixunssfti( float a); 
tu_int __fixunsdfti( double a); tu_int __fixunsxfti(long double a); -uint64_t __fixunstfdi(long double input); // ppc only +tu_int __fixunstfti( tf_float a); float __floatdisf(di_int a); double __floatdidf(di_int a); long double __floatdixf(di_int a); -long double __floatditf(int64_t a); // ppc only +tf_float __floatditf(int64_t a); float __floattisf(ti_int a); double __floattidf(ti_int a); long double __floattixf(ti_int a); +tf_float __floattitf(ti_int a); float __floatundisf(du_int a); double __floatundidf(du_int a); long double __floatundixf(du_int a); -long double __floatunditf(uint64_t a); // ppc only +tf_float __floatunditf(du_int a); float __floatuntisf(tu_int a); double __floatuntidf(tu_int a); long double __floatuntixf(tu_int a); +tf_float __floatuntitf(tu_int a); // Floating point raised to integer power float __powisf2( float a, int b); // a ^ b double __powidf2( double a, int b); // a ^ b long double __powixf2(long double a, int b); // a ^ b -long double __powitf2(long double a, int b); // ppc only, a ^ b +tf_float __powitf2( tf_float a, int b); // a ^ b // Complex arithmetic @@ -189,8 +194,7 @@ long double __powitf2(long double a, int b); // ppc only, a ^ b double _Complex __muldc3(double a, double b, double c, double d); long double _Complex __mulxc3(long double a, long double b, long double c, long double d); -long double _Complex __multc3(long double a, long double b, - long double c, long double d); // ppc only + tf_float _Complex __multc3(tf_float a, tf_float b, tf_float c, tf_float d); // (a + ib) / (c + id) @@ -198,8 +202,7 @@ long double _Complex __multc3(long double a, long double b, double _Complex __divdc3(double a, double b, double c, double d); long double _Complex __divxc3(long double a, long double b, long double c, long double d); -long double _Complex __divtc3(long double a, long double b, - long double c, long double d); // ppc only + tf_float _Complex __divtc3(tf_float a, tf_float b, tf_float c, tf_float d); // Runtime support @@ -269,6 +272,11 @@ switch32 switch8 switchu8 +// This function generates a custom trampoline function with the specific +// realFunc and localsPtr values. +void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated, + const void* realFunc, void* localsPtr); + // There is no C interface to the *_vfp_d8_d15_regs functions. They are // called in the prolog and epilog of Thumb1 functions. When the C++ ABI uses // SJLJ for exceptions, each function with a catch clause or destructors needs diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S index 5dc0d53..1fe18f4 100644 --- a/compiler-rt/lib/builtins/aarch64/lse.S +++ b/compiler-rt/lib/builtins/aarch64/lse.S @@ -7,7 +7,7 @@ // Out-of-line LSE atomics helpers. Ported from libgcc library.
// N = {1, 2, 4, 8} // M = {1, 2, 4, 8, 16} -// ORDER = {'relax', 'acq', 'rel', 'acq_rel'} +// ORDER = {'relax', 'acq', 'rel', 'acq_rel', 'sync'} // Routines implemented: // // iM __aarch64_casM_ORDER(iM expected, iM desired, iM *ptr) @@ -35,8 +35,8 @@ HIDDEN(___aarch64_have_lse_atomics) #endif // Generate mnemonics for -// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4 -// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4 +// L_cas: SIZE: 1,2,4,8,16 MODEL: 1,2,3,4,5 +// L_swp L_ldadd L_ldclr L_ldeor L_ldset: SIZE: 1,2,4,8 MODEL: 1,2,3,4,5 #if SIZE == 1 #define S b @@ -64,24 +64,44 @@ HIDDEN(___aarch64_have_lse_atomics) #define L #define M 0x000000 #define N 0x000000 +#define BARRIER #elif MODEL == 2 #define SUFF _acq #define A a #define L #define M 0x400000 #define N 0x800000 +#define BARRIER #elif MODEL == 3 #define SUFF _rel #define A #define L l #define M 0x008000 #define N 0x400000 +#define BARRIER #elif MODEL == 4 #define SUFF _acq_rel #define A a #define L l #define M 0x408000 #define N 0xc00000 +#define BARRIER +#elif MODEL == 5 +#define SUFF _sync +#ifdef L_swp +// swp has _acq semantics. +#define A a +#define L +#define M 0x400000 +#define N 0x800000 +#else +// All other _sync functions have _seq semantics. +#define A a +#define L l +#define M 0x408000 +#define N 0xc00000 +#endif +#define BARRIER dmb ish #else #error #endif // MODEL @@ -96,7 +116,12 @@ HIDDEN(___aarch64_have_lse_atomics) #endif #define NAME(BASE) GLUE4(__aarch64_, BASE, SIZE, SUFF) +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXR GLUE3(ld, xr, S) +#else #define LDXR GLUE4(ld, A, xr, S) +#endif #define STXR GLUE4(st, L, xr, S) // Define temporary registers. @@ -136,9 +161,15 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXR w(tmp1), s(1), [x2] cbnz w(tmp1), 0b 1: + BARRIER ret #else +#if MODEL == 5 +// Drop A for _sync functions. +#define LDXP GLUE2(ld, xp) +#else #define LDXP GLUE3(ld, A, xp) +#endif #define STXP GLUE3(st, L, xp) #ifdef HAS_ASM_LSE #define CASP GLUE3(casp, A, L) x0, x1, x2, x3, [x4] @@ -159,6 +190,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(cas)) STXP w(tmp2), x2, x3, [x4] cbnz w(tmp2), 0b 1: + BARRIER ret #endif END_COMPILERRT_OUTLINE_FUNCTION(NAME(cas)) @@ -180,6 +212,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(swp)) LDXR s(0), [x1] STXR w(tmp1), s(tmp0), [x1] cbnz w(tmp1), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(swp)) #endif // L_swp @@ -224,6 +257,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(NAME(LDNM)) OP s(tmp1), s(0), s(tmp0) STXR w(tmp2), s(tmp1), [x1] cbnz w(tmp2), 0b + BARRIER ret END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM)) #endif // L_ldadd L_ldclr L_ldeor L_ldset diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c new file mode 100644 index 0000000..b6ee121 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c @@ -0,0 +1,52 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +__attribute__((visibility("hidden"), nocommon)) +_Bool __aarch64_has_sme_and_tpidr2_el0; + +// We have multiple ways to check that the function has SME, depending on our +// target. +// * For Linux we can use __getauxval(). +// * For newlib we can use __aarch64_sme_accessible(). 
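+// Whichever probe is used, the result is cached in
+// __aarch64_has_sme_and_tpidr2_el0 by the constructor below, so the assembly
+// routines in sme-abi.S can test for SME with a load instead of re-probing.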
+ +#if defined(__linux__) + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef HWCAP2_SME +#define HWCAP2_SME (1 << 23) +#endif + +extern unsigned long int __getauxval (unsigned long int); + +static _Bool has_sme(void) { + return __getauxval(AT_HWCAP2) & HWCAP2_SME; +} + +#else // defined(__linux__) + +#if defined(COMPILER_RT_SHARED_LIB) +__attribute__((weak)) +#endif +extern _Bool __aarch64_sme_accessible(void); + +static _Bool has_sme(void) { +#if defined(COMPILER_RT_SHARED_LIB) + if (!__aarch64_sme_accessible) + return 0; +#endif + return __aarch64_sme_accessible(); +} + +#endif // defined(__linux__) + +#if __GNUC__ >= 9 +#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" +#endif +__attribute__((constructor(90))) +static void init_aarch64_has_sme(void) { + __aarch64_has_sme_and_tpidr2_el0 = has_sme(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c new file mode 100644 index 0000000..4b9ee8c --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-vg.c @@ -0,0 +1,18 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../cpu_model/aarch64.h" + +struct FEATURES { + unsigned long long features; +}; + +extern struct FEATURES __aarch64_cpu_features; + +CONSTRUCTOR_ATTRIBUTE static void get_aarch64_cpu_features(void) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + __init_cpu_features(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S new file mode 100644 index 0000000..3e9bd2c --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -0,0 +1,230 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This patch implements the support routines for the SME ABI, +// described here: +// https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines + +#include "../assembly.h" + + +#if !defined(__APPLE__) +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features) +#define CPU_FEATS_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_cpu_features) +#else +// MachO requires @page/@pageoff directives because the global is defined +// in a different file. Otherwise this file may fail to build. +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page +#define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff +#define CPU_FEATS_SYMBOL SYMBOL_NAME(__aarch64_cpu_features)@page +#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff +#endif + +.arch armv9-a+sme + +// Utility function which calls a system's abort() routine. Because the function +// is streaming-compatible it should disable streaming-SVE mode before calling +// abort(). Note that there is no need to preserve any state before the call, +// because the function does not return. +DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) + .cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + BTI_C + stp x29, x30, [sp, #-32]! 
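+  // cntd reads the number of 64-bit elements in an SVE vector, which is the
+  // current vector granule (VG).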
+ cntd x0 + // Store VG to a stack location that we describe with .cfi_offset + str x0, [sp, #16] + .cfi_def_cfa_offset 32 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + .cfi_offset 46, -16 + bl __arm_sme_state + tbz x0, #0, 2f +1: + smstop sm +2: + // We can't make this into a tail-call because the unwinder would + // need to restore the value of VG. + bl SYMBOL_NAME(abort) + .cfi_endproc +END_COMPILERRT_FUNCTION(do_abort) + +// __arm_sme_state fills the result registers based on a local +// that is set as part of the compiler-rt startup code. +// __aarch64_has_sme_and_tpidr2_el0 +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) + .variant_pcs __arm_sme_state + BTI_C + mov x0, xzr + mov x1, xzr + + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + orr x0, x0, #0xC000000000000000 + mrs x16, SVCR + bfxil x0, x16, #0, #2 + mrs x1, TPIDR2_EL0 +1: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) + .variant_pcs __arm_tpidr2_restore + BTI_C + // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific + // manner. + mrs x14, TPIDR2_EL0 + cbnz x14, 2f + + // If any of the reserved bytes in the first 16 bytes of BLK are nonzero, + // the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x0, #10] + cbnz w14, 2f + ldr w14, [x0, #12] + cbnz w14, 2f + + // If BLK.za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x0] + cbz x16, 1f + + // If BLK.num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x0, #8] + cbz x14, 1f + + mov x15, xzr +0: + ldr za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) + .variant_pcs __arm_tpidr2_save + BTI_C + // If the current thread does not have access to TPIDR2_EL0, the subroutine + // does nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 1f + + // If TPIDR2_EL0 is null, the subroutine does nothing. + mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x16, #10] + cbnz w14, 2f + ldr w14, [x16, #12] + cbnz w14, 2f + + // If num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x16, #8] + cbz x14, 1f + + // If za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x16] + cbz x16, 1f + + mov x15, xzr +0: + str za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) + .variant_pcs __arm_za_disable + BTI_C + // If the current thread does not have access to SME, the subroutine does + // nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 0f + + // Otherwise, the subroutine behaves as if it did the following: + // * Call __arm_tpidr2_save. + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + bl __arm_tpidr2_save + + // * Set TPIDR2_EL0 to null. + msr TPIDR2_EL0, xzr + + // * Set PSTATE.ZA to 0. 
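+  //   (smstop za clears only PSTATE.ZA; streaming mode, PSTATE.SM, is left
+  //   unchanged.)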
+ smstop za + + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 +0: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg) + .variant_pcs __arm_get_current_vg + BTI_C + + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + adrp x17, CPU_FEATS_SYMBOL + ldr w17, [x17, CPU_FEATS_SYMBOL_OFFSET] + tbnz w17, #30, 0f + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + mov x18, x1 + bl __arm_sme_state + mov x1, x18 + and x17, x17, #0x40000000 + bfxil x17, x0, #0, #1 + cbz x17, 1f + cntd x0 + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +1: + mov x0, xzr + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg) + +NO_EXEC_STACK_DIRECTIVE + +// GNU property note for BTI and PAC +GNU_PROPERTY_BTI_PAC diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S new file mode 100644 index 0000000..0318d9a --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S @@ -0,0 +1,352 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Routines taken from libc/AOR_v20.02/string/aarch64 + +#include "../assembly.h" + +#ifdef __aarch64__ + +#define L(l) .L ## l + +// +// __arm_sc_memcpy / __arm_sc_memmove +// + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend1 x4 +#define dstend1 x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend1 +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy) + add srcend1, src, count + add dstend1, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend1, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend1, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend1, -8] + str A_l, [dstin] + str A_h, [dstend1, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. 
*/ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend1, -4] + str A_lw, [dstin] + str B_lw, [dstend1, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend1, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend1, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend1, -32] + ldp D_l, D_h, [srcend1, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend1, -64] + ldp H_l, H_h, [srcend1, -48] + stp G_l, G_h, [dstend1, -64] + stp H_l, H_h, [dstend1, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend1, -32] + stp D_l, D_h, [dstend1, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend1, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend1, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend1, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend1, -64] + stp A_l, A_h, [dstend1, -48] + stp B_l, B_h, [dstend1, -32] + stp C_l, C_h, [dstend1, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend1, -16] + and tmp1, dstend1, 15 + sub srcend1, srcend1, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend1, -16] + stp D_l, D_h, [dstend1, -16] + ldp B_l, B_h, [srcend1, -32] + ldp C_l, C_h, [srcend1, -48] + ldp D_l, D_h, [srcend1, -64]! + sub dstend1, dstend1, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [srcend1, -16] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [srcend1, -32] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [srcend1, -48] + stp D_l, D_h, [dstend1, -64]! + ldp D_l, D_h, [srcend1, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. 
*/ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend1, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend1, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend1, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend1, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy) + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy) + + +// +// __arm_sc_memset +// + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend2 x4 +#define zva_val x5 + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset) +#ifdef __ARM_FEATURE_SVE + mov z0.b, valw +#else + bfi valw, valw, #8, #8 + bfi valw, valw, #16, #16 + bfi val, val, #32, #32 + fmov d0, val + fmov v0.d[1], val +#endif + add dstend2, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend2, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend2, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend2, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend2, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend2, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend2, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend2, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret + +L(no_zva): + sub count, dstend2, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! 
+ subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend2, -64] + stp q0, q0, [dstend2, -32] + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset) + +#endif // __aarch64__ diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c new file mode 100644 index 0000000..315490e --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c @@ -0,0 +1,12 @@ +#include <stddef.h> + +const void *__arm_sc_memchr(const void *src, int c, + size_t n) __arm_streaming_compatible { + const unsigned char *srcp = (const unsigned char *)src; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + if (srcp[i] == c8) + return &srcp[i]; + + return NULL; +} diff --git a/compiler-rt/lib/builtins/absvdi2.c b/compiler-rt/lib/builtins/absvdi2.c index b9566cd..291ab5f 100644 --- a/compiler-rt/lib/builtins/absvdi2.c +++ b/compiler-rt/lib/builtins/absvdi2.c @@ -18,7 +18,7 @@ COMPILER_RT_ABI di_int __absvdi2(di_int a) { const int N = (int)(sizeof(di_int) * CHAR_BIT); - if (a == ((di_int)1 << (N - 1))) + if (a == ((di_int)((du_int)1 << (N - 1)))) compilerrt_abort(); const di_int t = a >> (N - 1); return (a ^ t) - t; diff --git a/compiler-rt/lib/builtins/absvsi2.c b/compiler-rt/lib/builtins/absvsi2.c index 9d5de7e..9977c33 100644 --- a/compiler-rt/lib/builtins/absvsi2.c +++ b/compiler-rt/lib/builtins/absvsi2.c @@ -18,7 +18,7 @@ COMPILER_RT_ABI si_int __absvsi2(si_int a) { const int N = (int)(sizeof(si_int) * CHAR_BIT); - if (a == ((si_int)1 << (N - 1))) + if (a == ((si_int)((su_int)1 << (N - 1)))) compilerrt_abort(); const si_int t = a >> (N - 1); return (a ^ t) - t; diff --git a/compiler-rt/lib/builtins/absvti2.c b/compiler-rt/lib/builtins/absvti2.c index 491d99d..bc6933b 100644 --- a/compiler-rt/lib/builtins/absvti2.c +++ b/compiler-rt/lib/builtins/absvti2.c @@ -20,7 +20,7 @@ COMPILER_RT_ABI ti_int __absvti2(ti_int a) { const int N = (int)(sizeof(ti_int) * CHAR_BIT); - if (a == ((ti_int)1 << (N - 1))) + if (a == (ti_int)((tu_int)1 << (N - 1))) compilerrt_abort(); const ti_int s = a >> (N - 1); return (a ^ s) - s; diff --git a/compiler-rt/lib/builtins/addtf3.c b/compiler-rt/lib/builtins/addtf3.c index 86e4f4c..2cb3a4d 100644 --- a/compiler-rt/lib/builtins/addtf3.c +++ b/compiler-rt/lib/builtins/addtf3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_add_impl.inc" COMPILER_RT_ABI fp_t __addtf3(fp_t a, fp_t b) { diff --git a/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S b/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S index bd039a0..c7abdb0 100644 --- a/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S +++ b/compiler-rt/lib/builtins/arm/aeabi_cdcmp.S @@ -8,10 +8,6 @@ #include "../assembly.h" -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error big endian support not implemented -#endif - #define APSR_Z (1 << 30) #define APSR_C (1 << 29) diff --git a/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S b/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S index a26cb2a..81c4766 100644 --- a/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S +++ b/compiler-rt/lib/builtins/arm/aeabi_cfcmp.S @@ -8,10 +8,6 @@ #include "../assembly.h" -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -#error big endian support not implemented -#endif - #define APSR_Z (1 << 30) #define APSR_C (1 << 29) diff --git a/compiler-rt/lib/builtins/arm/divsi3.S b/compiler-rt/lib/builtins/arm/divsi3.S index 761bf49..faf9af9 100644 --- a/compiler-rt/lib/builtins/arm/divsi3.S +++ 
b/compiler-rt/lib/builtins/arm/divsi3.S @@ -37,7 +37,8 @@ DEFINE_COMPILERRT_FUNCTION(__divsi3) sdiv r0, r0, r1 bx lr LOCAL_LABEL(divzero): - mov r0,#0 + // Use movs for compatibility with v8-m.base. + movs r0,#0 bx lr #else ESTABLISH_FRAME diff --git a/compiler-rt/lib/builtins/arm/udivsi3.S b/compiler-rt/lib/builtins/arm/udivsi3.S index 9b1b035..16528e8 100644 --- a/compiler-rt/lib/builtins/arm/udivsi3.S +++ b/compiler-rt/lib/builtins/arm/udivsi3.S @@ -32,7 +32,8 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3) bx lr LOCAL_LABEL(divby0): - mov r0, #0 + // Use movs for compatibility with v8-m.base. + movs r0, #0 # ifdef __ARM_EABI__ b __aeabi_idiv0 # else @@ -203,7 +204,7 @@ LOCAL_LABEL(divby0): LOCAL_LABEL(block_skip_##shift) :; \ adcs r3, r3 // same as ((r3 << 1) | Carry). Carry is set if r0 >= r2. - // TODO: if current location counter is not not word aligned, we don't + // TODO: if current location counter is not word aligned, we don't // need the .p2align and nop // Label div0block must be word-aligned. First align block 31 .p2align 2 diff --git a/compiler-rt/lib/builtins/ashldi3.c b/compiler-rt/lib/builtins/ashldi3.c index 04f2222..7b835da 100644 --- a/compiler-rt/lib/builtins/ashldi3.c +++ b/compiler-rt/lib/builtins/ashldi3.c @@ -28,7 +28,8 @@ COMPILER_RT_ABI di_int __ashldi3(di_int a, int b) { if (b == 0) return a; result.s.low = input.s.low << b; - result.s.high = (input.s.high << b) | (input.s.low >> (bits_in_word - b)); + result.s.high = + ((su_int)input.s.high << b) | (input.s.low >> (bits_in_word - b)); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashlti3.c b/compiler-rt/lib/builtins/ashlti3.c index 2d7bd4a..2bebf10 100644 --- a/compiler-rt/lib/builtins/ashlti3.c +++ b/compiler-rt/lib/builtins/ashlti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __ashlti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __ashlti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); twords input; twords result; @@ -30,7 +30,8 @@ COMPILER_RT_ABI ti_int __ashlti3(ti_int a, si_int b) { if (b == 0) return a; result.s.low = input.s.low << b; - result.s.high = (input.s.high << b) | (input.s.low >> (bits_in_dword - b)); + result.s.high = + ((du_int)input.s.high << b) | (input.s.low >> (bits_in_dword - b)); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashrdi3.c b/compiler-rt/lib/builtins/ashrdi3.c index 934a5c4..c0879b8 100644 --- a/compiler-rt/lib/builtins/ashrdi3.c +++ b/compiler-rt/lib/builtins/ashrdi3.c @@ -29,7 +29,8 @@ COMPILER_RT_ABI di_int __ashrdi3(di_int a, int b) { if (b == 0) return a; result.s.high = input.s.high >> b; - result.s.low = (input.s.high << (bits_in_word - b)) | (input.s.low >> b); + result.s.low = + ((su_int)input.s.high << (bits_in_word - b)) | (input.s.low >> b); } return result.all; } diff --git a/compiler-rt/lib/builtins/ashrti3.c b/compiler-rt/lib/builtins/ashrti3.c index f573b6d..d6b1ad9 100644 --- a/compiler-rt/lib/builtins/ashrti3.c +++ b/compiler-rt/lib/builtins/ashrti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __ashrti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __ashrti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); twords input; twords result; @@ -31,7 +31,8 @@ COMPILER_RT_ABI ti_int __ashrti3(ti_int a, si_int b) { if (b == 0) return a; result.s.high = input.s.high >> b; - result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b); + result.s.low = + ((du_int)input.s.high << (bits_in_dword 
- b)) | (input.s.low >> b); } return result.all; } diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h index 69a3d86..8c42fc7 100644 --- a/compiler-rt/lib/builtins/assembly.h +++ b/compiler-rt/lib/builtins/assembly.h @@ -260,14 +260,15 @@ .globl name SEPARATOR \ SYMBOL_IS_FUNC(name) SEPARATOR \ DECLARE_SYMBOL_VISIBILITY_UNMANGLED(name) SEPARATOR \ - CFI_START SEPARATOR \ DECLARE_FUNC_ENCODING \ - name: SEPARATOR BTI_C + name: \ + SEPARATOR CFI_START \ + SEPARATOR BTI_C #define DEFINE_COMPILERRT_FUNCTION_ALIAS(name, target) \ .globl SYMBOL_NAME(name) SEPARATOR \ SYMBOL_IS_FUNC(SYMBOL_NAME(name)) SEPARATOR \ - DECLARE_SYMBOL_VISIBILITY(SYMBOL_NAME(name)) SEPARATOR \ + DECLARE_SYMBOL_VISIBILITY(name) SEPARATOR \ .set SYMBOL_NAME(name), SYMBOL_NAME(target) SEPARATOR #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/atomic.c b/compiler-rt/lib/builtins/atomic.c index 852bb20..aded25d 100644 --- a/compiler-rt/lib/builtins/atomic.c +++ b/compiler-rt/lib/builtins/atomic.c @@ -12,7 +12,7 @@ // // 1) This code must work with C programs that do not link to anything // (including pthreads) and so it should not depend on any pthread -// functions. +// functions. If the user wishes to opt into using pthreads, they may do so. // 2) Atomic operations, rather than explicit mutexes, are most commonly used // on code where contended operations are rare. // @@ -56,7 +56,17 @@ static const long SPINLOCK_MASK = SPINLOCK_COUNT - 1; // defined. Each platform should define the Lock type, and corresponding // lock() and unlock() functions. //////////////////////////////////////////////////////////////////////////////// -#if defined(__FreeBSD__) || defined(__DragonFly__) +#if defined(_LIBATOMIC_USE_PTHREAD) +#include <pthread.h> +typedef pthread_mutex_t Lock; +/// Unlock a lock. This is a release operation. +__inline static void unlock(Lock *l) { pthread_mutex_unlock(l); } +/// Locks a lock. +__inline static void lock(Lock *l) { pthread_mutex_lock(l); } +/// locks for atomic operations +static Lock locks[SPINLOCK_COUNT]; + +#elif defined(__FreeBSD__) || defined(__DragonFly__) #include <errno.h> // clang-format off #include <sys/types.h> diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c index 8993761..2ac99b2 100644 --- a/compiler-rt/lib/builtins/clear_cache.c +++ b/compiler-rt/lib/builtins/clear_cache.c @@ -110,10 +110,14 @@ void __clear_cache(void *start, void *end) { "jr.hb $at\n" "move $at, $0\n" ".set at"); -#else +#elif defined(__linux__) || defined(__OpenBSD__) // Pre-R6 may not be globalized. And some implementations may give strange // synci_step. So, let's use libc call for it. 
- cacheflush(start, end_int - start_int, BCACHE); + _flush_cache(start, end_int - start_int, BCACHE); +#else + (void)start_int; + (void)end_int; + compilerrt_abort(); #endif } #elif defined(__aarch64__) && !defined(__APPLE__) diff --git a/compiler-rt/lib/builtins/comparetf2.c b/compiler-rt/lib/builtins/comparetf2.c index f159245..be5e9e5 100644 --- a/compiler-rt/lib/builtins/comparetf2.c +++ b/compiler-rt/lib/builtins/comparetf2.c @@ -39,7 +39,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_compare_impl.inc" COMPILER_RT_ABI CMP_RESULT __letf2(fp_t a, fp_t b) { return __leXf2__(a, b); } diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c deleted file mode 100644 index f5ad530..0000000 --- a/compiler-rt/lib/builtins/cpu_model.c +++ /dev/null @@ -1,1357 +0,0 @@ -//===-- cpu_model.c - Support for __cpu_model builtin ------------*- C -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is based on LLVM's lib/Support/Host.cpp. -// It implements the operating system Host concept and builtin -// __cpu_model for the compiler_rt library for x86 and -// __aarch64_have_lse_atomics, __aarch64_cpu_features for AArch64. -// -//===----------------------------------------------------------------------===// - -#ifndef __has_attribute -#define __has_attribute(attr) 0 -#endif - -#if __has_attribute(constructor) -#if __GNUC__ >= 9 -// Ordinarily init priorities below 101 are disallowed as they are reserved for the -// implementation. However, we are the implementation, so silence the diagnostic, -// since it doesn't apply to us. -#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" -#endif -// We're choosing init priority 90 to force our constructors to run before any -// constructors in the end user application (starting at priority 101). This value -// matches the libgcc choice for the same functions. -#define CONSTRUCTOR_ATTRIBUTE __attribute__((constructor(90))) -#else -// FIXME: For MSVC, we should make a function pointer global in .CRT$X?? so that -// this runs during initialization. 
-#define CONSTRUCTOR_ATTRIBUTE -#endif - -#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \ - defined(_M_X64)) && \ - (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) - -#include <assert.h> - -#define bool int -#define true 1 -#define false 0 - -#ifdef _MSC_VER -#include <intrin.h> -#endif - -enum VendorSignatures { - SIG_INTEL = 0x756e6547, // Genu - SIG_AMD = 0x68747541, // Auth -}; - -enum ProcessorVendors { - VENDOR_INTEL = 1, - VENDOR_AMD, - VENDOR_OTHER, - VENDOR_MAX -}; - -enum ProcessorTypes { - INTEL_BONNELL = 1, - INTEL_CORE2, - INTEL_COREI7, - AMDFAM10H, - AMDFAM15H, - INTEL_SILVERMONT, - INTEL_KNL, - AMD_BTVER1, - AMD_BTVER2, - AMDFAM17H, - INTEL_KNM, - INTEL_GOLDMONT, - INTEL_GOLDMONT_PLUS, - INTEL_TREMONT, - AMDFAM19H, - ZHAOXIN_FAM7H, - INTEL_SIERRAFOREST, - INTEL_GRANDRIDGE, - CPU_TYPE_MAX -}; - -enum ProcessorSubtypes { - INTEL_COREI7_NEHALEM = 1, - INTEL_COREI7_WESTMERE, - INTEL_COREI7_SANDYBRIDGE, - AMDFAM10H_BARCELONA, - AMDFAM10H_SHANGHAI, - AMDFAM10H_ISTANBUL, - AMDFAM15H_BDVER1, - AMDFAM15H_BDVER2, - AMDFAM15H_BDVER3, - AMDFAM15H_BDVER4, - AMDFAM17H_ZNVER1, - INTEL_COREI7_IVYBRIDGE, - INTEL_COREI7_HASWELL, - INTEL_COREI7_BROADWELL, - INTEL_COREI7_SKYLAKE, - INTEL_COREI7_SKYLAKE_AVX512, - INTEL_COREI7_CANNONLAKE, - INTEL_COREI7_ICELAKE_CLIENT, - INTEL_COREI7_ICELAKE_SERVER, - AMDFAM17H_ZNVER2, - INTEL_COREI7_CASCADELAKE, - INTEL_COREI7_TIGERLAKE, - INTEL_COREI7_COOPERLAKE, - INTEL_COREI7_SAPPHIRERAPIDS, - INTEL_COREI7_ALDERLAKE, - AMDFAM19H_ZNVER3, - INTEL_COREI7_ROCKETLAKE, - ZHAOXIN_FAM7H_LUJIAZUI, - AMDFAM19H_ZNVER4, - INTEL_COREI7_GRANITERAPIDS, - CPU_SUBTYPE_MAX -}; - -enum ProcessorFeatures { - FEATURE_CMOV = 0, - FEATURE_MMX, - FEATURE_POPCNT, - FEATURE_SSE, - FEATURE_SSE2, - FEATURE_SSE3, - FEATURE_SSSE3, - FEATURE_SSE4_1, - FEATURE_SSE4_2, - FEATURE_AVX, - FEATURE_AVX2, - FEATURE_SSE4_A, - FEATURE_FMA4, - FEATURE_XOP, - FEATURE_FMA, - FEATURE_AVX512F, - FEATURE_BMI, - FEATURE_BMI2, - FEATURE_AES, - FEATURE_PCLMUL, - FEATURE_AVX512VL, - FEATURE_AVX512BW, - FEATURE_AVX512DQ, - FEATURE_AVX512CD, - FEATURE_AVX512ER, - FEATURE_AVX512PF, - FEATURE_AVX512VBMI, - FEATURE_AVX512IFMA, - FEATURE_AVX5124VNNIW, - FEATURE_AVX5124FMAPS, - FEATURE_AVX512VPOPCNTDQ, - FEATURE_AVX512VBMI2, - FEATURE_GFNI, - FEATURE_VPCLMULQDQ, - FEATURE_AVX512VNNI, - FEATURE_AVX512BITALG, - FEATURE_AVX512BF16, - FEATURE_AVX512VP2INTERSECT, - CPU_FEATURE_MAX -}; - -// The check below for i386 was copied from clang's cpuid.h (__get_cpuid_max). -// Check motivated by bug reports for OpenSSL crashing on CPUs without CPUID -// support. Consequently, for i386, the presence of CPUID is checked first -// via the corresponding eflags bit. -static bool isCpuIdSupported(void) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__i386__) - int __cpuid_supported; - __asm__(" pushfl\n" - " popl %%eax\n" - " movl %%eax,%%ecx\n" - " xorl $0x00200000,%%eax\n" - " pushl %%eax\n" - " popfl\n" - " pushfl\n" - " popl %%eax\n" - " movl $0,%0\n" - " cmpl %%eax,%%ecx\n" - " je 1f\n" - " movl $1,%0\n" - "1:" - : "=r"(__cpuid_supported) - : - : "eax", "ecx"); - if (!__cpuid_supported) - return false; -#endif - return true; -#endif - return true; -} - -// This code is copied from lib/Support/Host.cpp. -// Changes to either file should be mirrored in the other. - -/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in -/// the specified arguments. If we can't run cpuid on the host, return true. 
-static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, - unsigned *rECX, unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__x86_64__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. - // FIXME: should we save this for Clang? - __asm__("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value)); - return false; -#elif defined(__i386__) - __asm__("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value)); - return false; -#else - return true; -#endif -#elif defined(_MSC_VER) - // The MSVC intrinsic is portable across x86 and x64. - int registers[4]; - __cpuid(registers, value); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -} - -/// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return -/// the 4 values in the specified arguments. If we can't run cpuid on the host, -/// return true. -static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, - unsigned *rEAX, unsigned *rEBX, unsigned *rECX, - unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) -#if defined(__x86_64__) - // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually. - // FIXME: should we save this for Clang? - __asm__("movq\t%%rbx, %%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx, %%rsi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value), "c"(subleaf)); - return false; -#elif defined(__i386__) - __asm__("movl\t%%ebx, %%esi\n\t" - "cpuid\n\t" - "xchgl\t%%ebx, %%esi\n\t" - : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX) - : "a"(value), "c"(subleaf)); - return false; -#else - return true; -#endif -#elif defined(_MSC_VER) - int registers[4]; - __cpuidex(registers, value, subleaf); - *rEAX = registers[0]; - *rEBX = registers[1]; - *rECX = registers[2]; - *rEDX = registers[3]; - return false; -#else - return true; -#endif -} - -// Read control register 0 (XCR0). Used to detect features such as AVX. -static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) { -#if defined(__GNUC__) || defined(__clang__) - // Check xgetbv; this uses a .byte sequence instead of the instruction - // directly because older assemblers do not include support for xgetbv and - // there is no easy way to conditionally compile based on the assembler used. - __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(*rEAX), "=d"(*rEDX) : "c"(0)); - return false; -#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) - unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); - *rEAX = Result; - *rEDX = Result >> 32; - return false; -#else - return true; -#endif -} - -static void detectX86FamilyModel(unsigned EAX, unsigned *Family, - unsigned *Model) { - *Family = (EAX >> 8) & 0xf; // Bits 8 - 11 - *Model = (EAX >> 4) & 0xf; // Bits 4 - 7 - if (*Family == 6 || *Family == 0xf) { - if (*Family == 0xf) - // Examine extended family ID if family ID is F. - *Family += (EAX >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. 
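// Worked example: an i7-6700K (Skylake) reports EAX = 0x000506E3 in leaf 1.
// Family = (EAX >> 8) & 0xF = 6 and base Model = (EAX >> 4) & 0xF = 0xE;
// because Family == 6, the extended model bits (EAX >> 16) & 0xF = 5 are
// folded in below, giving Model = 0x5E -- later matched as "skylake".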
- *Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 - } -} - -static const char * -getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, - const unsigned *Features, - unsigned *Type, unsigned *Subtype) { -#define testFeature(F) \ - (Features[F / 32] & (1 << (F % 32))) != 0 - - // We select CPU strings to match the code in Host.cpp, but we don't use them - // in compiler-rt. - const char *CPU = 0; - - switch (Family) { - case 6: - switch (Model) { - case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile - // processor, Intel Core 2 Quad processor, Intel Core 2 Quad - // mobile processor, Intel Core 2 Extreme processor, Intel - // Pentium Dual-Core processor, Intel Xeon processor, model - // 0Fh. All processors are manufactured using the 65 nm process. - case 0x16: // Intel Celeron processor model 16h. All processors are - // manufactured using the 65 nm process - CPU = "core2"; - *Type = INTEL_CORE2; - break; - case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model - // 17h. All processors are manufactured using the 45 nm process. - // - // 45nm: Penryn , Wolfdale, Yorkfield (XE) - case 0x1d: // Intel Xeon processor MP. All processors are manufactured using - // the 45 nm process. - CPU = "penryn"; - *Type = INTEL_CORE2; - break; - case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 45 nm process. - case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. - // As found in a Summer 2010 model iMac. - case 0x1f: - case 0x2e: // Nehalem EX - CPU = "nehalem"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_NEHALEM; - break; - case 0x25: // Intel Core i7, laptop version. - case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 32 nm process. - case 0x2f: // Westmere EX - CPU = "westmere"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_WESTMERE; - break; - case 0x2a: // Intel Core i7 processor. All processors are manufactured - // using the 32 nm process. 
- case 0x2d: - CPU = "sandybridge"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SANDYBRIDGE; - break; - case 0x3a: - case 0x3e: // Ivy Bridge EP - CPU = "ivybridge"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_IVYBRIDGE; - break; - - // Haswell: - case 0x3c: - case 0x3f: - case 0x45: - case 0x46: - CPU = "haswell"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_HASWELL; - break; - - // Broadwell: - case 0x3d: - case 0x47: - case 0x4f: - case 0x56: - CPU = "broadwell"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_BROADWELL; - break; - - // Skylake: - case 0x4e: // Skylake mobile - case 0x5e: // Skylake desktop - case 0x8e: // Kaby Lake mobile - case 0x9e: // Kaby Lake desktop - case 0xa5: // Comet Lake-H/S - case 0xa6: // Comet Lake-U - CPU = "skylake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SKYLAKE; - break; - - // Rocketlake: - case 0xa7: - CPU = "rocketlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ROCKETLAKE; - break; - - // Skylake Xeon: - case 0x55: - *Type = INTEL_COREI7; - if (testFeature(FEATURE_AVX512BF16)) { - CPU = "cooperlake"; - *Subtype = INTEL_COREI7_COOPERLAKE; - } else if (testFeature(FEATURE_AVX512VNNI)) { - CPU = "cascadelake"; - *Subtype = INTEL_COREI7_CASCADELAKE; - } else { - CPU = "skylake-avx512"; - *Subtype = INTEL_COREI7_SKYLAKE_AVX512; - } - break; - - // Cannonlake: - case 0x66: - CPU = "cannonlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_CANNONLAKE; - break; - - // Icelake: - case 0x7d: - case 0x7e: - CPU = "icelake-client"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_CLIENT; - break; - - // Tigerlake: - case 0x8c: - case 0x8d: - CPU = "tigerlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_TIGERLAKE; - break; - - // Alderlake: - case 0x97: - case 0x9a: - // Raptorlake: - case 0xb7: - // Meteorlake: - case 0xaa: - case 0xac: - CPU = "alderlake"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ALDERLAKE; - break; - - // Icelake Xeon: - case 0x6a: - case 0x6c: - CPU = "icelake-server"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_ICELAKE_SERVER; - break; - - // Emerald Rapids: - case 0xcf: - // Sapphire Rapids: - case 0x8f: - CPU = "sapphirerapids"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_SAPPHIRERAPIDS; - break; - - // Granite Rapids: - case 0xae: - case 0xad: - CPU = "graniterapids"; - *Type = INTEL_COREI7; - *Subtype = INTEL_COREI7_GRANITERAPIDS; - break; - - case 0x1c: // Most 45 nm Intel Atom processors - case 0x26: // 45 nm Atom Lincroft - case 0x27: // 32 nm Atom Medfield - case 0x35: // 32 nm Atom Midview - case 0x36: // 32 nm Atom Midview - CPU = "bonnell"; - *Type = INTEL_BONNELL; - break; - - // Atom Silvermont codes from the Intel software optimization guide. - case 0x37: - case 0x4a: - case 0x4d: - case 0x5a: - case 0x5d: - case 0x4c: // really airmont - CPU = "silvermont"; - *Type = INTEL_SILVERMONT; - break; - // Goldmont: - case 0x5c: // Apollo Lake - case 0x5f: // Denverton - CPU = "goldmont"; - *Type = INTEL_GOLDMONT; - break; // "goldmont" - case 0x7a: - CPU = "goldmont-plus"; - *Type = INTEL_GOLDMONT_PLUS; - break; - case 0x86: - CPU = "tremont"; - *Type = INTEL_TREMONT; - break; - - // Sierraforest: - case 0xaf: - CPU = "sierraforest"; - *Type = INTEL_SIERRAFOREST; - break; - - // Grandridge: - case 0xb6: - CPU = "grandridge"; - *Type = INTEL_GRANDRIDGE; - break; - - case 0x57: - CPU = "knl"; - *Type = INTEL_KNL; - break; - - case 0x85: - CPU = "knm"; - *Type = INTEL_KNM; - break; - - default: // Unknown family 6 CPU. 
- break; - } - break; - default: - break; // Unknown. - } - - return CPU; -} - -static const char * -getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, - const unsigned *Features, - unsigned *Type, unsigned *Subtype) { - // We select CPU strings to match the code in Host.cpp, but we don't use them - // in compiler-rt. - const char *CPU = 0; - - switch (Family) { - case 16: - CPU = "amdfam10"; - *Type = AMDFAM10H; - switch (Model) { - case 2: - *Subtype = AMDFAM10H_BARCELONA; - break; - case 4: - *Subtype = AMDFAM10H_SHANGHAI; - break; - case 8: - *Subtype = AMDFAM10H_ISTANBUL; - break; - } - break; - case 20: - CPU = "btver1"; - *Type = AMD_BTVER1; - break; - case 21: - CPU = "bdver1"; - *Type = AMDFAM15H; - if (Model >= 0x60 && Model <= 0x7f) { - CPU = "bdver4"; - *Subtype = AMDFAM15H_BDVER4; - break; // 60h-7Fh: Excavator - } - if (Model >= 0x30 && Model <= 0x3f) { - CPU = "bdver3"; - *Subtype = AMDFAM15H_BDVER3; - break; // 30h-3Fh: Steamroller - } - if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) { - CPU = "bdver2"; - *Subtype = AMDFAM15H_BDVER2; - break; // 02h, 10h-1Fh: Piledriver - } - if (Model <= 0x0f) { - *Subtype = AMDFAM15H_BDVER1; - break; // 00h-0Fh: Bulldozer - } - break; - case 22: - CPU = "btver2"; - *Type = AMD_BTVER2; - break; - case 23: - CPU = "znver1"; - *Type = AMDFAM17H; - if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) { - CPU = "znver2"; - *Subtype = AMDFAM17H_ZNVER2; - break; // 30h-3fh, 71h: Zen2 - } - if (Model <= 0x0f) { - *Subtype = AMDFAM17H_ZNVER1; - break; // 00h-0Fh: Zen1 - } - break; - case 25: - CPU = "znver3"; - *Type = AMDFAM19H; - if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) { - // Family 19h Models 00h-0Fh - Zen3 - // Family 19h Models 20h-2Fh - Zen3 - // Family 19h Models 30h-3Fh - Zen3 - // Family 19h Models 40h-4Fh - Zen3+ - // Family 19h Models 50h-5Fh - Zen3+ - *Subtype = AMDFAM19H_ZNVER3; - break; - } - if ((Model >= 0x10 && Model <= 0x1f) || - (Model >= 0x60 && Model <= 0x74) || - (Model >= 0x78 && Model <= 0x7b) || - (Model >= 0xA0 && Model <= 0xAf)) { - CPU = "znver4"; - *Subtype = AMDFAM19H_ZNVER4; - break; // "znver4" - } - break; - default: - break; // Unknown AMD CPU. - } - - return CPU; -} - -static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, - unsigned *Features) { - unsigned EAX, EBX; - -#define setFeature(F) \ - Features[F / 32] |= 1U << (F % 32) - - if ((EDX >> 15) & 1) - setFeature(FEATURE_CMOV); - if ((EDX >> 23) & 1) - setFeature(FEATURE_MMX); - if ((EDX >> 25) & 1) - setFeature(FEATURE_SSE); - if ((EDX >> 26) & 1) - setFeature(FEATURE_SSE2); - - if ((ECX >> 0) & 1) - setFeature(FEATURE_SSE3); - if ((ECX >> 1) & 1) - setFeature(FEATURE_PCLMUL); - if ((ECX >> 9) & 1) - setFeature(FEATURE_SSSE3); - if ((ECX >> 12) & 1) - setFeature(FEATURE_FMA); - if ((ECX >> 19) & 1) - setFeature(FEATURE_SSE4_1); - if ((ECX >> 20) & 1) - setFeature(FEATURE_SSE4_2); - if ((ECX >> 23) & 1) - setFeature(FEATURE_POPCNT); - if ((ECX >> 25) & 1) - setFeature(FEATURE_AES); - - // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV - // indicates that the AVX registers will be saved and restored on context - // switch, then we have full AVX support. 
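// Concretely: bit 27 (OSXSAVE) and bit 28 (AVX) of CPUID.1:ECX must both be
// set, and XCR0 bits 1..2 (SSE and AVX state) must read back as 0x6. For
// example, XCR0 = 0x7 passes the check below, while XCR0 = 0x3 means the OS
// does not save YMM state, so AVX is not reported even on AVX hardware.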
- const unsigned AVXBits = (1 << 27) | (1 << 28); - bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && - ((EAX & 0x6) == 0x6); -#if defined(__APPLE__) - // Darwin lazily saves the AVX512 context on first use: trust that the OS will - // save the AVX512 context if we use AVX512 instructions, even the bit is not - // set right now. - bool HasAVX512Save = true; -#else - // AVX512 requires additional context to be saved by the OS. - bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0); -#endif - - if (HasAVX) - setFeature(FEATURE_AVX); - - bool HasLeaf7 = - MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); - - if (HasLeaf7 && ((EBX >> 3) & 1)) - setFeature(FEATURE_BMI); - if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX) - setFeature(FEATURE_AVX2); - if (HasLeaf7 && ((EBX >> 8) & 1)) - setFeature(FEATURE_BMI2); - if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512F); - if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512DQ); - if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512IFMA); - if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512PF); - if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512ER); - if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512CD); - if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BW); - if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VL); - - if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VBMI); - if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VBMI2); - if (HasLeaf7 && ((ECX >> 8) & 1)) - setFeature(FEATURE_GFNI); - if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVX) - setFeature(FEATURE_VPCLMULQDQ); - if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VNNI); - if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BITALG); - if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VPOPCNTDQ); - - if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX5124VNNIW); - if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX5124FMAPS); - if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512VP2INTERSECT); - - bool HasLeaf7Subleaf1 = - MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); - if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) - setFeature(FEATURE_AVX512BF16); - - unsigned MaxExtLevel; - getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); - - bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 && - !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); - if (HasExtLeaf1 && ((ECX >> 6) & 1)) - setFeature(FEATURE_SSE4_A); - if (HasExtLeaf1 && ((ECX >> 11) & 1)) - setFeature(FEATURE_XOP); - if (HasExtLeaf1 && ((ECX >> 16) & 1)) - setFeature(FEATURE_FMA4); -#undef setFeature -} - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -int __cpu_indicator_init(void) CONSTRUCTOR_ATTRIBUTE; - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -struct __processor_model { - unsigned int __cpu_vendor; - unsigned int __cpu_type; - unsigned int __cpu_subtype; - unsigned int __cpu_features[1]; -} __cpu_model = {0, 0, 0, {0}}; - -#ifndef _WIN32 -__attribute__((visibility("hidden"))) -#endif -unsigned int __cpu_features2 = 0; - -// A constructor function that is sets 
__cpu_model and __cpu_features2 with -// the right values. This needs to run only once. This constructor is -// given the highest priority and it should run before constructors without -// the priority set. However, it still runs after ifunc initializers and -// needs to be called explicitly there. - -int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { - unsigned EAX, EBX, ECX, EDX; - unsigned MaxLeaf = 5; - unsigned Vendor; - unsigned Model, Family; - unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - - // This function needs to run just once. - if (__cpu_model.__cpu_vendor) - return 0; - - if (!isCpuIdSupported() || - getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { - __cpu_model.__cpu_vendor = VENDOR_OTHER; - return -1; - } - - getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX); - detectX86FamilyModel(EAX, &Family, &Model); - - // Find available features. - getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]); - - assert((sizeof(Features)/sizeof(Features[0])) == 2); - __cpu_model.__cpu_features[0] = Features[0]; - __cpu_features2 = Features[1]; - - if (Vendor == SIG_INTEL) { - // Get CPU type. - getIntelProcessorTypeAndSubtype(Family, Model, &Features[0], - &(__cpu_model.__cpu_type), - &(__cpu_model.__cpu_subtype)); - __cpu_model.__cpu_vendor = VENDOR_INTEL; - } else if (Vendor == SIG_AMD) { - // Get CPU type. - getAMDProcessorTypeAndSubtype(Family, Model, &Features[0], - &(__cpu_model.__cpu_type), - &(__cpu_model.__cpu_subtype)); - __cpu_model.__cpu_vendor = VENDOR_AMD; - } else - __cpu_model.__cpu_vendor = VENDOR_OTHER; - - assert(__cpu_model.__cpu_vendor < VENDOR_MAX); - assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); - assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); - - return 0; -} -#elif defined(__aarch64__) - -#ifndef AT_HWCAP -#define AT_HWCAP 16 -#endif -#ifndef HWCAP_CPUID -#define HWCAP_CPUID (1 << 11) -#endif -#ifndef HWCAP_FP -#define HWCAP_FP (1 << 0) -#endif -#ifndef HWCAP_ASIMD -#define HWCAP_ASIMD (1 << 1) -#endif -#ifndef HWCAP_AES -#define HWCAP_AES (1 << 3) -#endif -#ifndef HWCAP_PMULL -#define HWCAP_PMULL (1 << 4) -#endif -#ifndef HWCAP_SHA1 -#define HWCAP_SHA1 (1 << 5) -#endif -#ifndef HWCAP_SHA2 -#define HWCAP_SHA2 (1 << 6) -#endif -#ifndef HWCAP_ATOMICS -#define HWCAP_ATOMICS (1 << 8) -#endif -#ifndef HWCAP_FPHP -#define HWCAP_FPHP (1 << 9) -#endif -#ifndef HWCAP_ASIMDHP -#define HWCAP_ASIMDHP (1 << 10) -#endif -#ifndef HWCAP_ASIMDRDM -#define HWCAP_ASIMDRDM (1 << 12) -#endif -#ifndef HWCAP_JSCVT -#define HWCAP_JSCVT (1 << 13) -#endif -#ifndef HWCAP_FCMA -#define HWCAP_FCMA (1 << 14) -#endif -#ifndef HWCAP_LRCPC -#define HWCAP_LRCPC (1 << 15) -#endif -#ifndef HWCAP_DCPOP -#define HWCAP_DCPOP (1 << 16) -#endif -#ifndef HWCAP_SHA3 -#define HWCAP_SHA3 (1 << 17) -#endif -#ifndef HWCAP_SM3 -#define HWCAP_SM3 (1 << 18) -#endif -#ifndef HWCAP_SM4 -#define HWCAP_SM4 (1 << 19) -#endif -#ifndef HWCAP_ASIMDDP -#define HWCAP_ASIMDDP (1 << 20) -#endif -#ifndef HWCAP_SHA512 -#define HWCAP_SHA512 (1 << 21) -#endif -#ifndef HWCAP_SVE -#define HWCAP_SVE (1 << 22) -#endif -#ifndef HWCAP_ASIMDFHM -#define HWCAP_ASIMDFHM (1 << 23) -#endif -#ifndef HWCAP_DIT -#define HWCAP_DIT (1 << 24) -#endif -#ifndef HWCAP_ILRCPC -#define HWCAP_ILRCPC (1 << 26) -#endif -#ifndef HWCAP_FLAGM -#define HWCAP_FLAGM (1 << 27) -#endif -#ifndef HWCAP_SSBS -#define HWCAP_SSBS (1 << 28) -#endif -#ifndef HWCAP_SB -#define HWCAP_SB (1 << 29) -#endif - -#ifndef AT_HWCAP2 -#define AT_HWCAP2 26 -#endif -#ifndef HWCAP2_DCPODP -#define HWCAP2_DCPODP (1 << 0) -#endif -#ifndef 
HWCAP2_SVE2 -#define HWCAP2_SVE2 (1 << 1) -#endif -#ifndef HWCAP2_SVEAES -#define HWCAP2_SVEAES (1 << 2) -#endif -#ifndef HWCAP2_SVEPMULL -#define HWCAP2_SVEPMULL (1 << 3) -#endif -#ifndef HWCAP2_SVEBITPERM -#define HWCAP2_SVEBITPERM (1 << 4) -#endif -#ifndef HWCAP2_SVESHA3 -#define HWCAP2_SVESHA3 (1 << 5) -#endif -#ifndef HWCAP2_SVESM4 -#define HWCAP2_SVESM4 (1 << 6) -#endif -#ifndef HWCAP2_FLAGM2 -#define HWCAP2_FLAGM2 (1 << 7) -#endif -#ifndef HWCAP2_FRINT -#define HWCAP2_FRINT (1 << 8) -#endif -#ifndef HWCAP2_SVEI8MM -#define HWCAP2_SVEI8MM (1 << 9) -#endif -#ifndef HWCAP2_SVEF32MM -#define HWCAP2_SVEF32MM (1 << 10) -#endif -#ifndef HWCAP2_SVEF64MM -#define HWCAP2_SVEF64MM (1 << 11) -#endif -#ifndef HWCAP2_SVEBF16 -#define HWCAP2_SVEBF16 (1 << 12) -#endif -#ifndef HWCAP2_I8MM -#define HWCAP2_I8MM (1 << 13) -#endif -#ifndef HWCAP2_BF16 -#define HWCAP2_BF16 (1 << 14) -#endif -#ifndef HWCAP2_DGH -#define HWCAP2_DGH (1 << 15) -#endif -#ifndef HWCAP2_RNG -#define HWCAP2_RNG (1 << 16) -#endif -#ifndef HWCAP2_BTI -#define HWCAP2_BTI (1 << 17) -#endif -#ifndef HWCAP2_MTE -#define HWCAP2_MTE (1 << 18) -#endif -#ifndef HWCAP2_RPRES -#define HWCAP2_RPRES (1 << 21) -#endif -#ifndef HWCAP2_MTE3 -#define HWCAP2_MTE3 (1 << 22) -#endif -#ifndef HWCAP2_SME -#define HWCAP2_SME (1 << 23) -#endif -#ifndef HWCAP2_SME_I16I64 -#define HWCAP2_SME_I16I64 (1 << 24) -#endif -#ifndef HWCAP2_SME_F64F64 -#define HWCAP2_SME_F64F64 (1 << 25) -#endif -#ifndef HWCAP2_WFXT -#define HWCAP2_WFXT (1UL << 31) -#endif -#ifndef HWCAP2_EBF16 -#define HWCAP2_EBF16 (1UL << 32) -#endif -#ifndef HWCAP2_SVE_EBF16 -#define HWCAP2_SVE_EBF16 (1UL << 33) -#endif - -// LSE support detection for out-of-line atomics -// using HWCAP and Auxiliary vector -_Bool __aarch64_have_lse_atomics - __attribute__((visibility("hidden"), nocommon)); - -#if defined(__has_include) -#if __has_include() -#include -#if __has_include() -#include - -#if defined(__ANDROID__) -#include -#include -#elif defined(__Fuchsia__) -#include -#include -#endif - -// Detect Exynos 9810 CPU -#define IF_EXYNOS9810 \ - char arch[PROP_VALUE_MAX]; \ - if (__system_property_get("ro.arch", arch) > 0 && \ - strncmp(arch, "exynos9810", sizeof("exynos9810") - 1) == 0) - -static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) { -#if defined(__FreeBSD__) - unsigned long hwcap; - int result = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); - __aarch64_have_lse_atomics = result == 0 && (hwcap & HWCAP_ATOMICS) != 0; -#elif defined(__Fuchsia__) - // This ensures the vDSO is a direct link-time dependency of anything that - // needs this initializer code. -#pragma comment(lib, "zircon") - uint32_t features; - zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); - __aarch64_have_lse_atomics = - status == ZX_OK && (features & ZX_ARM64_FEATURE_ISA_ATOMICS) != 0; -#else - unsigned long hwcap = getauxval(AT_HWCAP); - _Bool result = (hwcap & HWCAP_ATOMICS) != 0; -#if defined(__ANDROID__) - if (result) { - // Some cores in the Exynos 9810 CPU are ARMv8.2 and others are ARMv8.0; - // only the former support LSE atomics. However, the kernel in the - // initial Android 8.0 release of Galaxy S9/S9+ devices incorrectly - // reported the feature as being supported. - // - // The kernel appears to have been corrected to mark it unsupported as of - // the Android 9.0 release on those devices, and this issue has not been - // observed anywhere else. Thus, this workaround may be removed if - // compiler-rt ever drops support for Android 8.0. 
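// For example, on an affected Galaxy S9 the system property reads back as
// ro.arch = "exynos9810", so the guarded statement below executes and LSE is
// disabled even though HWCAP_ATOMICS was set in the auxiliary vector.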
- IF_EXYNOS9810 result = false; - } -#endif // defined(__ANDROID__) - __aarch64_have_lse_atomics = result; -#endif // defined(__FreeBSD__) -} - -#if !defined(DISABLE_AARCH64_FMV) -// CPUFeatures must correspond to the same AArch64 features in -// AArch64TargetParser.h -enum CPUFeatures { - FEAT_RNG, - FEAT_FLAGM, - FEAT_FLAGM2, - FEAT_FP16FML, - FEAT_DOTPROD, - FEAT_SM4, - FEAT_RDM, - FEAT_LSE, - FEAT_FP, - FEAT_SIMD, - FEAT_CRC, - FEAT_SHA1, - FEAT_SHA2, - FEAT_SHA3, - FEAT_AES, - FEAT_PMULL, - FEAT_FP16, - FEAT_DIT, - FEAT_DPB, - FEAT_DPB2, - FEAT_JSCVT, - FEAT_FCMA, - FEAT_RCPC, - FEAT_RCPC2, - FEAT_FRINTTS, - FEAT_DGH, - FEAT_I8MM, - FEAT_BF16, - FEAT_EBF16, - FEAT_RPRES, - FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, - FEAT_SVE_F32MM, - FEAT_SVE_F64MM, - FEAT_SVE2, - FEAT_SVE_AES, - FEAT_SVE_PMULL128, - FEAT_SVE_BITPERM, - FEAT_SVE_SHA3, - FEAT_SVE_SM4, - FEAT_SME, - FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, - FEAT_SB, - FEAT_PREDRES, - FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, - FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, - FEAT_WFXT, - FEAT_SME_F64, - FEAT_SME_I64, - FEAT_SME2, - FEAT_MAX -}; - -// Architecture features used -// in Function Multi Versioning -struct { - unsigned long long features; - // As features grows new fields could be added -} __aarch64_cpu_features __attribute__((visibility("hidden"), nocommon)); - -void init_cpu_features_resolver(unsigned long hwcap, unsigned long hwcap2) { -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F -#define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) -#define extractBits(val, start, number) \ - (val & ((1ULL << number) - 1ULL) << start) >> start - if (hwcap & HWCAP_CRC32) - setCPUFeature(FEAT_CRC); - if (hwcap & HWCAP_PMULL) - setCPUFeature(FEAT_PMULL); - if (hwcap & HWCAP_FLAGM) - setCPUFeature(FEAT_FLAGM); - if (hwcap2 & HWCAP2_FLAGM2) { - setCPUFeature(FEAT_FLAGM); - setCPUFeature(FEAT_FLAGM2); - } - if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4) - setCPUFeature(FEAT_SM4); - if (hwcap & HWCAP_ASIMDDP) - setCPUFeature(FEAT_DOTPROD); - if (hwcap & HWCAP_ASIMDFHM) - setCPUFeature(FEAT_FP16FML); - if (hwcap & HWCAP_FPHP) { - setCPUFeature(FEAT_FP16); - setCPUFeature(FEAT_FP); - } - if (hwcap & HWCAP_DIT) - setCPUFeature(FEAT_DIT); - if (hwcap & HWCAP_ASIMDRDM) - setCPUFeature(FEAT_RDM); - if (hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC2); - if (hwcap & HWCAP_AES) - setCPUFeature(FEAT_AES); - if (hwcap & HWCAP_SHA1) - setCPUFeature(FEAT_SHA1); - if (hwcap & HWCAP_SHA2) - setCPUFeature(FEAT_SHA2); - if (hwcap & HWCAP_JSCVT) - setCPUFeature(FEAT_JSCVT); - if (hwcap & HWCAP_FCMA) - setCPUFeature(FEAT_FCMA); - if (hwcap & HWCAP_SB) - setCPUFeature(FEAT_SB); - if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - setCPUFeature(FEAT_MEMTAG3); - } - if (hwcap2 & HWCAP2_SVEAES) - setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); - setCPUFeature(FEAT_SVE_PMULL128); - } - if (hwcap2 & HWCAP2_SVEBITPERM) - setCPUFeature(FEAT_SVE_BITPERM); - if (hwcap2 & HWCAP2_SVESHA3) - setCPUFeature(FEAT_SVE_SHA3); - if (hwcap2 & HWCAP2_SVESM4) - setCPUFeature(FEAT_SVE_SM4); - if (hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB2); - if (hwcap & HWCAP_ATOMICS) - setCPUFeature(FEAT_LSE); - if (hwcap2 & HWCAP2_RNG) - setCPUFeature(FEAT_RNG); - if (hwcap2 & HWCAP2_I8MM) - 
setCPUFeature(FEAT_I8MM); - if (hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_EBF16); - if (hwcap2 & HWCAP2_SVE_EBF16) - setCPUFeature(FEAT_SVE_EBF16); - if (hwcap2 & HWCAP2_DGH) - setCPUFeature(FEAT_DGH); - if (hwcap2 & HWCAP2_FRINT) - setCPUFeature(FEAT_FRINTTS); - if (hwcap2 & HWCAP2_SVEI8MM) - setCPUFeature(FEAT_SVE_I8MM); - if (hwcap2 & HWCAP2_SVEF32MM) - setCPUFeature(FEAT_SVE_F32MM); - if (hwcap2 & HWCAP2_SVEF64MM) - setCPUFeature(FEAT_SVE_F64MM); - if (hwcap2 & HWCAP2_BTI) - setCPUFeature(FEAT_BTI); - if (hwcap2 & HWCAP2_RPRES) - setCPUFeature(FEAT_RPRES); - if (hwcap2 & HWCAP2_WFXT) - setCPUFeature(FEAT_WFXT); - if (hwcap2 & HWCAP2_SME) - setCPUFeature(FEAT_SME); - if (hwcap2 & HWCAP2_SME_I16I64) - setCPUFeature(FEAT_SME_I64); - if (hwcap2 & HWCAP2_SME_F64F64) - setCPUFeature(FEAT_SME_F64); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - getCPUFeature(ID_AA64PFR1_EL1, ftr); - // ID_AA64PFR1_EL1.MTE >= 0b0001 - if (extractBits(ftr, 8, 4) >= 0x1) - setCPUFeature(FEAT_MEMTAG); - // ID_AA64PFR1_EL1.SSBS == 0b0001 - if (extractBits(ftr, 4, 4) == 0x1) - setCPUFeature(FEAT_SSBS); - // ID_AA64PFR1_EL1.SME == 0b0010 - if (extractBits(ftr, 24, 4) == 0x2) - setCPUFeature(FEAT_SME2); - getCPUFeature(ID_AA64PFR0_EL1, ftr); - // ID_AA64PFR0_EL1.FP != 0b1111 - if (extractBits(ftr, 16, 4) != 0xF) { - setCPUFeature(FEAT_FP); - // ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP - setCPUFeature(FEAT_SIMD); - } - // ID_AA64PFR0_EL1.SVE != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) { - // get ID_AA64ZFR0_EL1, that name supported - // if sve enabled only - getCPUFeature(S3_0_C0_C4_4, ftr); - // ID_AA64ZFR0_EL1.SVEver == 0b0000 - if (extractBits(ftr, 0, 4) == 0x0) - setCPUFeature(FEAT_SVE); - // ID_AA64ZFR0_EL1.SVEver == 0b0001 - if (extractBits(ftr, 0, 4) == 0x1) - setCPUFeature(FEAT_SVE2); - // ID_AA64ZFR0_EL1.BF16 != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); - } - getCPUFeature(ID_AA64ISAR0_EL1, ftr); - // ID_AA64ISAR0_EL1.SHA3 != 0b0000 - if (extractBits(ftr, 32, 4) != 0x0) - setCPUFeature(FEAT_SHA3); - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - // ID_AA64ISAR1_EL1.DPB >= 0b0001 - if (extractBits(ftr, 0, 4) >= 0x1) - setCPUFeature(FEAT_DPB); - // ID_AA64ISAR1_EL1.LRCPC != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_RCPC); - // ID_AA64ISAR1_EL1.SPECRES == 0b0001 - if (extractBits(ftr, 40, 4) == 0x2) - setCPUFeature(FEAT_PREDRES); - // ID_AA64ISAR1_EL1.BF16 != 0b0000 - if (extractBits(ftr, 44, 4) != 0x0) - setCPUFeature(FEAT_BF16); - // ID_AA64ISAR1_EL1.LS64 >= 0b0001 - if (extractBits(ftr, 60, 4) >= 0x1) - setCPUFeature(FEAT_LS64); - // ID_AA64ISAR1_EL1.LS64 >= 0b0010 - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - // ID_AA64ISAR1_EL1.LS64 >= 0b0011 - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } else { - // Set some features in case of no CPUID support - if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { - setCPUFeature(FEAT_FP); - // FP and AdvSIMD fields have the same value - setCPUFeature(FEAT_SIMD); - } - if (hwcap & HWCAP_DCPOP || hwcap2 & HWCAP2_DCPODP) - setCPUFeature(FEAT_DPB); - if (hwcap & HWCAP_LRCPC || hwcap & HWCAP_ILRCPC) - setCPUFeature(FEAT_RCPC); - if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); - if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) - setCPUFeature(FEAT_SVE2); - if (hwcap & HWCAP_SHA3) - setCPUFeature(FEAT_SHA3); - } -} - -void CONSTRUCTOR_ATTRIBUTE 
init_cpu_features(void) { - unsigned long hwcap; - unsigned long hwcap2; - // CPU features already initialized. - if (__aarch64_cpu_features.features) - return; - setCPUFeature(FEAT_MAX); -#if defined(__FreeBSD__) - int res = 0; - res = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); - res |= elf_aux_info(AT_HWCAP2, &hwcap2, sizeof hwcap2); - if (res) - return; -#else -#if defined(__ANDROID__) - // Don't set any CPU features, - // detection could be wrong on Exynos 9810. - IF_EXYNOS9810 return; -#endif // defined(__ANDROID__) - hwcap = getauxval(AT_HWCAP); - hwcap2 = getauxval(AT_HWCAP2); -#endif // defined(__FreeBSD__) - init_cpu_features_resolver(hwcap, hwcap2); -#undef extractBits -#undef getCPUFeature -#undef setCPUFeature -#undef IF_EXYNOS9810 -} -#endif // !defined(DISABLE_AARCH64_FMV) -#endif // defined(__has_include) -#endif // __has_include() -#endif // __has_include() -#endif // defined(__aarch64__) diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc new file mode 100644 index 0000000..e78bb88 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -0,0 +1,91 @@ +//===- AArch64CPUFeatures.inc - AArch64 CPU Features enum -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CPUFeatures enum for AArch64 to facilitate better +// testing of this code between LLVM and compiler-rt, primarily that the files +// are an exact match. +// +// This file has two identical copies. The primary copy lives in LLVM and +// the other one sits in compiler-rt/lib/builtins/cpu_model directory. To make +// changes in this file, first modify the primary copy and copy it over to +// compiler-rt. compiler-rt tests will fail if the two files are not synced up. +// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_CPU_FEATURS_INC_H +#define AARCH64_CPU_FEATURS_INC_H + +// Function Multi Versioning CPU features. 
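// Aside: what consumes the enum that follows (illustrative sketch, assuming a
// compiler with AArch64 function multiversioning support; dot_product is a
// hypothetical name). The compiler-emitted ifunc resolver initializes
// __aarch64_cpu_features and tests these FEAT_* bits to pick a version:
__attribute__((target_version("sve2"))) int dot_product(void) { return 2; }
__attribute__((target_version("default"))) int dot_product(void) { return 1; }
// A call to dot_product() binds to the "sve2" body only when the resolver
// observes the FEAT_SVE2 bit set in __aarch64_cpu_features.features.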
+enum CPUFeatures { + FEAT_RNG, + FEAT_FLAGM, + FEAT_FLAGM2, + FEAT_FP16FML, + FEAT_DOTPROD, + FEAT_SM4, + FEAT_RDM, + FEAT_LSE, + FEAT_FP, + FEAT_SIMD, + FEAT_CRC, + FEAT_SHA1, + FEAT_SHA2, + FEAT_SHA3, + FEAT_AES, + FEAT_PMULL, + FEAT_FP16, + FEAT_DIT, + FEAT_DPB, + FEAT_DPB2, + FEAT_JSCVT, + FEAT_FCMA, + FEAT_RCPC, + FEAT_RCPC2, + FEAT_FRINTTS, + FEAT_DGH, + FEAT_I8MM, + FEAT_BF16, + FEAT_EBF16, + FEAT_RPRES, + FEAT_SVE, + FEAT_SVE_BF16, + FEAT_SVE_EBF16, + FEAT_SVE_I8MM, + FEAT_SVE_F32MM, + FEAT_SVE_F64MM, + FEAT_SVE2, + FEAT_SVE_AES, + FEAT_SVE_PMULL128, + FEAT_SVE_BITPERM, + FEAT_SVE_SHA3, + FEAT_SVE_SM4, + FEAT_SME, + FEAT_MEMTAG, + FEAT_MEMTAG2, + FEAT_MEMTAG3, + FEAT_SB, + FEAT_PREDRES, + FEAT_SSBS, + FEAT_SSBS2, + FEAT_BTI, + FEAT_LS64, + FEAT_LS64_V, + FEAT_LS64_ACCDATA, + FEAT_WFXT, + FEAT_SME_F64, + FEAT_SME_I64, + FEAT_SME2, + FEAT_RCPC3, + FEAT_MOPS, + FEAT_MAX, + FEAT_EXT = 62, // Reserved to indicate presence of additional features field + // in __aarch64_cpu_features + FEAT_INIT // Used as flag of features initialization completion +}; + +#endif diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c new file mode 100644 index 0000000..0dd3977 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -0,0 +1,84 @@ +//===-- cpu_model/aarch64.c - Support for __cpu_model builtin ----*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on LLVM's lib/Support/Host.cpp. +// It implements __aarch64_have_lse_atomics, __aarch64_cpu_features for +// AArch64. +// +//===----------------------------------------------------------------------===// + +#include "aarch64.h" + +#if !defined(__aarch64__) +#error This file is intended only for aarch64-based targets +#endif + +#if __has_include() +#include +#else +typedef struct __ifunc_arg_t { + unsigned long _size; + unsigned long _hwcap; + unsigned long _hwcap2; +} __ifunc_arg_t; +#endif // __has_include() + +// LSE support detection for out-of-line atomics +// using HWCAP and Auxiliary vector +_Bool __aarch64_have_lse_atomics + __attribute__((visibility("hidden"), nocommon)) = false; + +#if defined(__FreeBSD__) +// clang-format off: should not reorder sys/auxv.h alphabetically +#include +// clang-format on +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/freebsd.inc" +#elif defined(__Fuchsia__) +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/fuchsia.inc" +#elif defined(__ANDROID__) +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/android.inc" +#elif defined(__linux__) && __has_include() +#include "aarch64/hwcap.inc" +#include "aarch64/lse_atomics/getauxval.inc" +#else +// When unimplemented, we leave __aarch64_have_lse_atomics initialized to false. 
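// (Usage note: the -moutline-atomics helpers, e.g. __aarch64_cas4_acq,
// branch on this flag at run time, taking the single-instruction LSE path
// when it is true and an LL/SC exclusive-load loop when it is false.)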
+#endif + +#if !defined(DISABLE_AARCH64_FMV) + +// Architecture features used +// in Function Multi Versioning +struct { + unsigned long long features; + // As features grows new fields could be added +} __aarch64_cpu_features __attribute__((visibility("hidden"), nocommon)); + +// The formatter wants to re-order these includes, but doing so is incorrect: +// clang-format off +#if defined(__APPLE__) +#include "aarch64/fmv/apple.inc" +#elif defined(__FreeBSD__) +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/freebsd.inc" +#elif defined(__Fuchsia__) +#include "aarch64/fmv/fuchsia.inc" +#elif defined(__ANDROID__) +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/android.inc" +#elif defined(__linux__) && __has_include() +#include "aarch64/fmv/mrs.inc" +#include "aarch64/fmv/getauxval.inc" +#else +#include "aarch64/fmv/unimplemented.inc" +#endif +// clang-format on + +#endif // !defined(DISABLE_AARCH64_FMV) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.h b/compiler-rt/lib/builtins/cpu_model/aarch64.h new file mode 100644 index 0000000..f6cbf75 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.h @@ -0,0 +1,21 @@ +//===-- cpu_model/aarch64.h --------------------------------------------- -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "cpu_model.h" + +#if !defined(__aarch64__) +#error This file is intended only for aarch64-based targets +#endif + +#if !defined(DISABLE_AARCH64_FMV) + +#include "AArch64CPUFeatures.inc" + +void __init_cpu_features(void); + +#endif // !defined(DISABLE_AARCH64_FMV) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc new file mode 100644 index 0000000..a9e3594 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/android.inc @@ -0,0 +1,36 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // ifunc resolvers don't have hwcaps in arguments on Android API lower + // than 30. If so, set feature detection done and keep all CPU features + // unsupported (zeros). To detect this case in runtime we check existence + // of memfd_create function from Standard C library which was introduced in + // Android API 30. + int memfd_create(const char *, unsigned int) __attribute__((weak)); + if (!memfd_create) + return; + + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // Don't set any CPU features, + // detection could be wrong on Exynos 9810. 
+ if (__isExynos9810()) + return; + + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc new file mode 100644 index 0000000..f069490 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -0,0 +1,159 @@ +#include +#if TARGET_OS_OSX || TARGET_OS_IPHONE +#include + +#if __has_include() +#include +#define HAS_CPU_CAPABILITIES_PUBLIC_H 1 + +// FB13964283 - A few of these didn't make it into the public SDK yet. +#ifndef CAP_BIT_FEAT_SME +#define CAP_BIT_FEAT_SME 40 +#endif +#ifndef CAP_BIT_FEAT_SME2 +#define CAP_BIT_FEAT_SME2 41 +#endif +#ifndef CAP_BIT_FEAT_SME_F64F64 +#define CAP_BIT_FEAT_SME_F64F64 42 +#endif +#ifndef CAP_BIT_FEAT_SME_I16I64 +#define CAP_BIT_FEAT_SME_I16I64 43 +#endif + +#endif + +static bool isKnownAndSupported(const char *name) { + int32_t val = 0; + size_t size = sizeof(val); + if (sysctlbyname(name, &val, &size, NULL, 0)) + return false; + return val; +} + +static uint64_t deriveImplicitFeatures(uint64_t features) { + // FEAT_SSBS2 implies FEAT_SSBS + if ((1ULL << FEAT_SSBS2) & features) + features |= (1ULL << FEAT_SSBS); + + // FEAT_FP is always enabled + features |= (1ULL << FEAT_FP); + + features |= (1ULL << FEAT_INIT); + + return features; +} + +void __init_cpu_features_resolver(void) { + // On Darwin platforms, this may be called concurrently by multiple threads + // because the resolvers that use it are called lazily at runtime (unlike on + // ELF platforms, where IFuncs are resolved serially at load time). This + // function's effect on __aarch64_cpu_features must be idempotent. + + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + uint64_t features = 0; + +#ifdef HAS_CPU_CAPABILITIES_PUBLIC_H + uint8_t feats_bitvec[(CAP_BIT_NB + 7) / 8] = {0}; + size_t len = sizeof(feats_bitvec); + // When hw.optional.arm.feats is available (macOS 15.0+, iOS 18.0+), use the + // fast path to get all the feature bits, otherwise fall back to the slow + // ~20-something sysctls path. 
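// (For example, on releases older than macOS 15 the sysctl queried below does
// not exist, sysctlbyname() fails, and control falls through to the
// per-feature hw.optional.* queries further down.)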
+ if (!sysctlbyname("hw.optional.arm.caps", &feats_bitvec, &len, 0, 0)) { + +#define CHECK_BIT(FROM, TO) \ + do { \ + if (feats_bitvec[FROM / 8] & (1u << ((FROM) & 7))) { \ + features |= (1ULL << TO); \ + } \ + } while (0) + + CHECK_BIT(CAP_BIT_FEAT_FlagM, FEAT_FLAGM); + CHECK_BIT(CAP_BIT_FEAT_FlagM2, FEAT_FLAGM2); + CHECK_BIT(CAP_BIT_FEAT_FHM, FEAT_FP16FML); + CHECK_BIT(CAP_BIT_FEAT_DotProd, FEAT_DOTPROD); + CHECK_BIT(CAP_BIT_FEAT_SHA3, FEAT_SHA3); + CHECK_BIT(CAP_BIT_FEAT_RDM, FEAT_RDM); + CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); + CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); + CHECK_BIT(CAP_BIT_FEAT_SHA1, FEAT_SHA1); + CHECK_BIT(CAP_BIT_FEAT_AES, FEAT_AES); + CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); + CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); + CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); + CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); + CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); + CHECK_BIT(CAP_BIT_FEAT_LRCPC2, FEAT_RCPC2); + CHECK_BIT(CAP_BIT_FEAT_FCMA, FEAT_FCMA); + CHECK_BIT(CAP_BIT_FEAT_JSCVT, FEAT_JSCVT); + CHECK_BIT(CAP_BIT_FEAT_DPB, FEAT_DPB); + CHECK_BIT(CAP_BIT_FEAT_DPB2, FEAT_DPB2); + CHECK_BIT(CAP_BIT_FEAT_BF16, FEAT_BF16); + CHECK_BIT(CAP_BIT_FEAT_I8MM, FEAT_I8MM); + CHECK_BIT(CAP_BIT_FEAT_DIT, FEAT_DIT); + CHECK_BIT(CAP_BIT_FEAT_FP16, FEAT_FP16); + CHECK_BIT(CAP_BIT_FEAT_SSBS, FEAT_SSBS2); + CHECK_BIT(CAP_BIT_FEAT_BTI, FEAT_BTI); + CHECK_BIT(CAP_BIT_AdvSIMD, FEAT_SIMD); + CHECK_BIT(CAP_BIT_CRC32, FEAT_CRC); + CHECK_BIT(CAP_BIT_FEAT_SME, FEAT_SME); + CHECK_BIT(CAP_BIT_FEAT_SME2, FEAT_SME2); + CHECK_BIT(CAP_BIT_FEAT_SME_F64F64, FEAT_SME_F64); + CHECK_BIT(CAP_BIT_FEAT_SME_I16I64, FEAT_SME_I64); + + features = deriveImplicitFeatures(features); + + __atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); + return; + } +#endif + + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + static const struct { + const char *sysctl_name; + enum CPUFeatures feature; + } feature_checks[] = { + {"hw.optional.arm.FEAT_FlagM", FEAT_FLAGM}, + {"hw.optional.arm.FEAT_FlagM2", FEAT_FLAGM2}, + {"hw.optional.arm.FEAT_FHM", FEAT_FP16FML}, + {"hw.optional.arm.FEAT_DotProd", FEAT_DOTPROD}, + {"hw.optional.arm.FEAT_RDM", FEAT_RDM}, + {"hw.optional.arm.FEAT_LSE", FEAT_LSE}, + {"hw.optional.AdvSIMD", FEAT_SIMD}, + {"hw.optional.armv8_crc32", FEAT_CRC}, + {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, + {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, + {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, + {"hw.optional.arm.FEAT_AES", FEAT_AES}, + {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, + {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, + {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, + {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, + {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, + {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, + {"hw.optional.arm.FEAT_FCMA", FEAT_FCMA}, + {"hw.optional.arm.FEAT_LRCPC", FEAT_RCPC}, + {"hw.optional.arm.FEAT_LRCPC2", FEAT_RCPC2}, + {"hw.optional.arm.FEAT_FRINTTS", FEAT_FRINTTS}, + {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, + {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, + {"hw.optional.arm.FEAT_SB", FEAT_SB}, + {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, + {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, + {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, + }; + + for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); + I != E; ++I) + if (isKnownAndSupported(feature_checks[I].sysctl_name)) + features |= (1ULL << feature_checks[I].feature); + + features = deriveImplicitFeatures(features); + + 
__atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); +} + +#endif // TARGET_OS_OSX || TARGET_OS_IPHONE diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc new file mode 100644 index 0000000..aa975dc --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc @@ -0,0 +1,27 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + unsigned long hwcap = 0; + unsigned long hwcap2 = 0; + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + int res = 0; + res = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); + res |= elf_aux_info(AT_HWCAP2, &hwcap2, sizeof hwcap2); + if (res) + return; + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc new file mode 100644 index 0000000..1ae4780 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -0,0 +1,53 @@ +#include +#include + +void __init_cpu_features_resolver() { + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + // This ensures the vDSO is a direct link-time dependency of anything that + // needs this initializer code. +#pragma comment(lib, "zircon") + uint32_t features; + zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) + return; + + unsigned long long feat = 0; +#define setCPUFeature(cpu_feature) feat |= 1ULL << cpu_feature + + if (features & ZX_ARM64_FEATURE_ISA_FP) + setCPUFeature(FEAT_FP); + if (features & ZX_ARM64_FEATURE_ISA_ASIMD) + setCPUFeature(FEAT_SIMD); + if (features & ZX_ARM64_FEATURE_ISA_AES) + setCPUFeature(FEAT_AES); + if (features & ZX_ARM64_FEATURE_ISA_PMULL) + setCPUFeature(FEAT_PMULL); + if (features & ZX_ARM64_FEATURE_ISA_SHA1) + setCPUFeature(FEAT_SHA1); + if (features & ZX_ARM64_FEATURE_ISA_SHA256) + setCPUFeature(FEAT_SHA2); + if (features & ZX_ARM64_FEATURE_ISA_CRC32) + setCPUFeature(FEAT_CRC); + if (features & ZX_ARM64_FEATURE_ISA_RDM) + setCPUFeature(FEAT_RDM); + if (features & ZX_ARM64_FEATURE_ISA_SHA3) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_SM4) + setCPUFeature(FEAT_SM4); + if (features & ZX_ARM64_FEATURE_ISA_DP) + setCPUFeature(FEAT_DOTPROD); + if (features & ZX_ARM64_FEATURE_ISA_FHM) + setCPUFeature(FEAT_FP16FML); + if (features & ZX_ARM64_FEATURE_ISA_SHA512) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_I8MM) + setCPUFeature(FEAT_I8MM); + if (features & ZX_ARM64_FEATURE_ISA_SVE) + setCPUFeature(FEAT_SVE); + + setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc new file mode 100644 index 0000000..486f77a --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/getauxval.inc @@ -0,0 +1,21 @@ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) { + if 
(__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + __init_cpu_features_constructor(hwcap, arg); +} + +void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { + // CPU features already initialized. + if (__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) + return; + + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + __ifunc_arg_t arg; + arg._size = sizeof(__ifunc_arg_t); + arg._hwcap = hwcap; + arg._hwcap2 = hwcap2; + __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc new file mode 100644 index 0000000..e4d5e7f --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -0,0 +1,149 @@ +#if __has_include() +#include +#define HAVE_SYS_AUXV_H +#endif + +static void __init_cpu_features_constructor(unsigned long hwcap, + const __ifunc_arg_t *arg) { + unsigned long long feat = 0; +#define setCPUFeature(F) feat |= 1ULL << F +#define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr)) +#define extractBits(val, start, number) \ + (val & ((1ULL << number) - 1ULL) << start) >> start + unsigned long hwcap2 = 0; + if (hwcap & _IFUNC_ARG_HWCAP) + hwcap2 = arg->_hwcap2; + if (hwcap & HWCAP_CRC32) + setCPUFeature(FEAT_CRC); + if (hwcap & HWCAP_PMULL) + setCPUFeature(FEAT_PMULL); + if (hwcap & HWCAP_FLAGM) + setCPUFeature(FEAT_FLAGM); + if (hwcap2 & HWCAP2_FLAGM2) + setCPUFeature(FEAT_FLAGM2); + if (hwcap & HWCAP_SM4) + setCPUFeature(FEAT_SM4); + if (hwcap & HWCAP_ASIMDDP) + setCPUFeature(FEAT_DOTPROD); + if (hwcap & HWCAP_ASIMDFHM) + setCPUFeature(FEAT_FP16FML); + if (hwcap & HWCAP_FPHP) + setCPUFeature(FEAT_FP16); + if (hwcap & HWCAP_DIT) + setCPUFeature(FEAT_DIT); + if (hwcap & HWCAP_ASIMDRDM) + setCPUFeature(FEAT_RDM); + if (hwcap & HWCAP_AES) + setCPUFeature(FEAT_AES); + if (hwcap & HWCAP_SHA1) + setCPUFeature(FEAT_SHA1); + if (hwcap & HWCAP_SHA2) + setCPUFeature(FEAT_SHA2); + if (hwcap & HWCAP_JSCVT) + setCPUFeature(FEAT_JSCVT); + if (hwcap & HWCAP_FCMA) + setCPUFeature(FEAT_FCMA); + if (hwcap & HWCAP_SB) + setCPUFeature(FEAT_SB); + if (hwcap & HWCAP_SSBS) { + setCPUFeature(FEAT_SSBS); + setCPUFeature(FEAT_SSBS2); + } + if (hwcap2 & HWCAP2_MTE) { + setCPUFeature(FEAT_MEMTAG); + setCPUFeature(FEAT_MEMTAG2); + } + if (hwcap2 & HWCAP2_MTE3) + setCPUFeature(FEAT_MEMTAG3); + if (hwcap2 & HWCAP2_SVEAES) + setCPUFeature(FEAT_SVE_AES); + if (hwcap2 & HWCAP2_SVEPMULL) + setCPUFeature(FEAT_SVE_PMULL128); + if (hwcap2 & HWCAP2_SVEBITPERM) + setCPUFeature(FEAT_SVE_BITPERM); + if (hwcap2 & HWCAP2_SVESHA3) + setCPUFeature(FEAT_SVE_SHA3); + if (hwcap2 & HWCAP2_SVESM4) + setCPUFeature(FEAT_SVE_SM4); + if (hwcap2 & HWCAP2_DCPODP) + setCPUFeature(FEAT_DPB2); + if (hwcap & HWCAP_ATOMICS) + setCPUFeature(FEAT_LSE); + if (hwcap2 & HWCAP2_RNG) + setCPUFeature(FEAT_RNG); + if (hwcap2 & HWCAP2_I8MM) + setCPUFeature(FEAT_I8MM); + if (hwcap2 & HWCAP2_EBF16) + setCPUFeature(FEAT_EBF16); + if (hwcap2 & HWCAP2_SVE_EBF16) + setCPUFeature(FEAT_SVE_EBF16); + if (hwcap2 & HWCAP2_DGH) + setCPUFeature(FEAT_DGH); + if (hwcap2 & HWCAP2_FRINT) + setCPUFeature(FEAT_FRINTTS); + if (hwcap2 & HWCAP2_SVEI8MM) + setCPUFeature(FEAT_SVE_I8MM); + if (hwcap2 & HWCAP2_SVEF32MM) + setCPUFeature(FEAT_SVE_F32MM); + if (hwcap2 & HWCAP2_SVEF64MM) + setCPUFeature(FEAT_SVE_F64MM); + if (hwcap2 & HWCAP2_BTI) + setCPUFeature(FEAT_BTI); + if (hwcap2 & HWCAP2_RPRES) + setCPUFeature(FEAT_RPRES); + if 
(hwcap2 & HWCAP2_WFXT) + setCPUFeature(FEAT_WFXT); + if (hwcap2 & HWCAP2_SME) + setCPUFeature(FEAT_SME); + if (hwcap2 & HWCAP2_SME2) + setCPUFeature(FEAT_SME2); + if (hwcap2 & HWCAP2_SME_I16I64) + setCPUFeature(FEAT_SME_I64); + if (hwcap2 & HWCAP2_SME_F64F64) + setCPUFeature(FEAT_SME_F64); + if (hwcap2 & HWCAP2_MOPS) + setCPUFeature(FEAT_MOPS); + if (hwcap & HWCAP_CPUID) { + unsigned long ftr; + + getCPUFeature(ID_AA64ISAR1_EL1, ftr); + /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ + if (extractBits(ftr, 40, 4) >= 0x1) + setCPUFeature(FEAT_PREDRES); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0001 */ + if (extractBits(ftr, 60, 4) >= 0x1) + setCPUFeature(FEAT_LS64); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0010 */ + if (extractBits(ftr, 60, 4) >= 0x2) + setCPUFeature(FEAT_LS64_V); + /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ + if (extractBits(ftr, 60, 4) >= 0x3) + setCPUFeature(FEAT_LS64_ACCDATA); + } + if (hwcap & HWCAP_FP) { + setCPUFeature(FEAT_FP); + // FP and AdvSIMD fields have the same value + setCPUFeature(FEAT_SIMD); + } + if (hwcap & HWCAP_DCPOP) + setCPUFeature(FEAT_DPB); + if (hwcap & HWCAP_LRCPC) + setCPUFeature(FEAT_RCPC); + if (hwcap & HWCAP_ILRCPC) + setCPUFeature(FEAT_RCPC2); + if (hwcap2 & HWCAP2_LRCPC3) + setCPUFeature(FEAT_RCPC3); + if (hwcap2 & HWCAP2_BF16) + setCPUFeature(FEAT_BF16); + if (hwcap2 & HWCAP2_SVEBF16) + setCPUFeature(FEAT_SVE_BF16); + if (hwcap & HWCAP_SVE) + setCPUFeature(FEAT_SVE); + if (hwcap2 & HWCAP2_SVE2) + setCPUFeature(FEAT_SVE2); + if (hwcap & HWCAP_SHA3) + setCPUFeature(FEAT_SHA3); + setCPUFeature(FEAT_INIT); + + __atomic_store_n(&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED); +} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc new file mode 100644 index 0000000..dc34624 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/unimplemented.inc @@ -0,0 +1,8 @@ +// On platforms that have not implemented this yet, we provide an implementation +// that does not claim support for any features by leaving +// __aarch64_cpu_features.features initialized to 0. 
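// (Net effect: FEAT_INIT is never set, resolvers observe zero feature bits,
// and every multiversioned function binds to its "default" version.)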
+ +void __init_cpu_features_resolver(unsigned long hwcap, + const __ifunc_arg_t *arg) {} + +void __init_cpu_features(void) {} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc new file mode 100644 index 0000000..41aba82 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc @@ -0,0 +1,189 @@ +#if __has_include() +#include +#define HAVE_SYS_HWCAP_H +#endif + +#ifndef _IFUNC_ARG_HWCAP +#define _IFUNC_ARG_HWCAP (1ULL << 62) +#endif +#ifndef AT_HWCAP +#define AT_HWCAP 16 +#endif +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif +#ifndef HWCAP_FP +#define HWCAP_FP (1 << 0) +#endif +#ifndef HWCAP_ASIMD +#define HWCAP_ASIMD (1 << 1) +#endif +#ifndef HWCAP_AES +#define HWCAP_AES (1 << 3) +#endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1 << 4) +#endif +#ifndef HWCAP_SHA1 +#define HWCAP_SHA1 (1 << 5) +#endif +#ifndef HWCAP_SHA2 +#define HWCAP_SHA2 (1 << 6) +#endif +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif +#ifndef HWCAP_ATOMICS +#define HWCAP_ATOMICS (1 << 8) +#endif +#ifndef HWCAP_FPHP +#define HWCAP_FPHP (1 << 9) +#endif +#ifndef HWCAP_ASIMDHP +#define HWCAP_ASIMDHP (1 << 10) +#endif +#ifndef HWCAP_ASIMDRDM +#define HWCAP_ASIMDRDM (1 << 12) +#endif +#ifndef HWCAP_JSCVT +#define HWCAP_JSCVT (1 << 13) +#endif +#ifndef HWCAP_FCMA +#define HWCAP_FCMA (1 << 14) +#endif +#ifndef HWCAP_LRCPC +#define HWCAP_LRCPC (1 << 15) +#endif +#ifndef HWCAP_DCPOP +#define HWCAP_DCPOP (1 << 16) +#endif +#ifndef HWCAP_SHA3 +#define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SM3 +#define HWCAP_SM3 (1 << 18) +#endif +#ifndef HWCAP_SM4 +#define HWCAP_SM4 (1 << 19) +#endif +#ifndef HWCAP_ASIMDDP +#define HWCAP_ASIMDDP (1 << 20) +#endif +#ifndef HWCAP_SHA512 +#define HWCAP_SHA512 (1 << 21) +#endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif +#ifndef HWCAP_ASIMDFHM +#define HWCAP_ASIMDFHM (1 << 23) +#endif +#ifndef HWCAP_DIT +#define HWCAP_DIT (1 << 24) +#endif +#ifndef HWCAP_ILRCPC +#define HWCAP_ILRCPC (1 << 26) +#endif +#ifndef HWCAP_FLAGM +#define HWCAP_FLAGM (1 << 27) +#endif +#ifndef HWCAP_SSBS +#define HWCAP_SSBS (1 << 28) +#endif +#ifndef HWCAP_SB +#define HWCAP_SB (1 << 29) +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif +#ifndef HWCAP2_DCPODP +#define HWCAP2_DCPODP (1 << 0) +#endif +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif +#ifndef HWCAP2_SVEAES +#define HWCAP2_SVEAES (1 << 2) +#endif +#ifndef HWCAP2_SVEPMULL +#define HWCAP2_SVEPMULL (1 << 3) +#endif +#ifndef HWCAP2_SVEBITPERM +#define HWCAP2_SVEBITPERM (1 << 4) +#endif +#ifndef HWCAP2_SVESHA3 +#define HWCAP2_SVESHA3 (1 << 5) +#endif +#ifndef HWCAP2_SVESM4 +#define HWCAP2_SVESM4 (1 << 6) +#endif +#ifndef HWCAP2_FLAGM2 +#define HWCAP2_FLAGM2 (1 << 7) +#endif +#ifndef HWCAP2_FRINT +#define HWCAP2_FRINT (1 << 8) +#endif +#ifndef HWCAP2_SVEI8MM +#define HWCAP2_SVEI8MM (1 << 9) +#endif +#ifndef HWCAP2_SVEF32MM +#define HWCAP2_SVEF32MM (1 << 10) +#endif +#ifndef HWCAP2_SVEF64MM +#define HWCAP2_SVEF64MM (1 << 11) +#endif +#ifndef HWCAP2_SVEBF16 +#define HWCAP2_SVEBF16 (1 << 12) +#endif +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif +#ifndef HWCAP2_BF16 +#define HWCAP2_BF16 (1 << 14) +#endif +#ifndef HWCAP2_DGH +#define HWCAP2_DGH (1 << 15) +#endif +#ifndef HWCAP2_RNG +#define HWCAP2_RNG (1 << 16) +#endif +#ifndef HWCAP2_BTI +#define HWCAP2_BTI (1 << 17) +#endif +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif +#ifndef HWCAP2_RPRES +#define HWCAP2_RPRES (1 << 21) +#endif +#ifndef HWCAP2_MTE3 
+#define HWCAP2_MTE3 (1 << 22)
+#endif
+#ifndef HWCAP2_SME
+#define HWCAP2_SME (1 << 23)
+#endif
+#ifndef HWCAP2_SME_I16I64
+#define HWCAP2_SME_I16I64 (1 << 24)
+#endif
+#ifndef HWCAP2_SME_F64F64
+#define HWCAP2_SME_F64F64 (1 << 25)
+#endif
+#ifndef HWCAP2_WFXT
+#define HWCAP2_WFXT (1UL << 31)
+#endif
+#ifndef HWCAP2_EBF16
+#define HWCAP2_EBF16 (1ULL << 32)
+#endif
+#ifndef HWCAP2_SVE_EBF16
+#define HWCAP2_SVE_EBF16 (1ULL << 33)
+#endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_MOPS
+#define HWCAP2_MOPS (1ULL << 43)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3 (1UL << 46)
+#endif
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
new file mode 100644
index 0000000..94bf64a
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
@@ -0,0 +1,28 @@
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/system_properties.h>
+
+static bool __isExynos9810(void) {
+  char arch[PROP_VALUE_MAX];
+  return __system_property_get("ro.arch", arch) > 0 &&
+         strncmp(arch, "exynos9810", sizeof("exynos9810") - 1) == 0;
+}
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  _Bool result = (hwcap & HWCAP_ATOMICS) != 0;
+  if (result) {
+    // Some cores in the Exynos 9810 CPU are ARMv8.2 and others are ARMv8.0;
+    // only the former support LSE atomics. However, the kernel in the
+    // initial Android 8.0 release of Galaxy S9/S9+ devices incorrectly
+    // reported the feature as being supported.
+    //
+    // The kernel appears to have been corrected to mark it unsupported as of
+    // the Android 9.0 release on those devices, and this issue has not been
+    // observed anywhere else. Thus, this workaround may be removed if
+    // compiler-rt ever drops support for Android 8.0.
+    if (__isExynos9810())
+      result = false;
+  }
+  __aarch64_have_lse_atomics = result;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc
new file mode 100644
index 0000000..4a1f9c2
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc
@@ -0,0 +1,5 @@
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap;
+  int result = elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
+  __aarch64_have_lse_atomics = result == 0 && (hwcap & HWCAP_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc
new file mode 100644
index 0000000..91eac70
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/fuchsia.inc
@@ -0,0 +1,12 @@
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  // This ensures the vDSO is a direct link-time dependency of anything that
+  // needs this initializer code.
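// (The pragma below emits a linker directive equivalent to passing -lzircon,
// so _zx_system_get_features from the Zircon vDSO is resolvable wherever
// this constructor is linked in.)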
+#pragma comment(lib, "zircon")
+  uint32_t features;
+  zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+  __aarch64_have_lse_atomics =
+      status == ZX_OK && (features & ZX_ARM64_FEATURE_ISA_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
new file mode 100644
index 0000000..6642c1f
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
@@ -0,0 +1,6 @@
+#include <sys/auxv.h>
+
+static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  __aarch64_have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
diff --git a/compiler-rt/lib/builtins/cpu_model/cpu_model.h b/compiler-rt/lib/builtins/cpu_model/cpu_model.h
new file mode 100644
index 0000000..924ca89
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/cpu_model.h
@@ -0,0 +1,41 @@
+//===-- cpu_model/cpu_model.h - Utilities for cpu model detection -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common utilities for runtime cpu model detection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef COMPILER_RT_LIB_BUILTINS_CPU_MODEL_COMMON_H
+#define COMPILER_RT_LIB_BUILTINS_CPU_MODEL_COMMON_H
+
+#define bool int
+#define true 1
+#define false 0
+
+#ifndef __has_attribute
+#define __has_attribute(attr) 0
+#endif
+
+#if __has_attribute(constructor)
+#if __GNUC__ >= 9
+// Ordinarily init priorities below 101 are disallowed as they are reserved for
+// the implementation. However, we are the implementation, so silence the
+// diagnostic, since it doesn't apply to us.
+#pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
+#endif
+// We're choosing init priority 90 to force our constructors to run before any
+// constructors in the end user application (starting at priority 101). This
+// value matches the libgcc choice for the same functions.
+#define CONSTRUCTOR_ATTRIBUTE __attribute__((constructor(90)))
+#else
+// FIXME: For MSVC, we should make a function pointer global in .CRT$X?? so that
+// this runs during initialization.
+#define CONSTRUCTOR_ATTRIBUTE
+#endif
+
+#endif
diff --git a/compiler-rt/lib/builtins/cpu_model/riscv.c b/compiler-rt/lib/builtins/cpu_model/riscv.c
new file mode 100644
index 0000000..987812c
--- /dev/null
+++ b/compiler-rt/lib/builtins/cpu_model/riscv.c
@@ -0,0 +1,370 @@
+//=== cpu_model/riscv.c - Update RISC-V Feature Bits Structure -*- C -*-======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "cpu_model.h" + +#define RISCV_FEATURE_BITS_LENGTH 2 +struct { + unsigned length; + unsigned long long features[RISCV_FEATURE_BITS_LENGTH]; +} __riscv_feature_bits __attribute__((visibility("hidden"), nocommon)); + +#define RISCV_VENDOR_FEATURE_BITS_LENGTH 1 +struct { + unsigned length; + unsigned long long features[RISCV_VENDOR_FEATURE_BITS_LENGTH]; +} __riscv_vendor_feature_bits __attribute__((visibility("hidden"), nocommon)); + +struct { + unsigned mVendorID; + unsigned mArchID; + unsigned mImplID; +} __riscv_cpu_model __attribute__((visibility("hidden"), nocommon)); + +// NOTE: Should sync-up with RISCVFeatures.td +// TODO: Maybe generate a header from tablegen then include it. +#define A_GROUPID 0 +#define A_BITMASK (1ULL << 0) +#define C_GROUPID 0 +#define C_BITMASK (1ULL << 2) +#define D_GROUPID 0 +#define D_BITMASK (1ULL << 3) +#define F_GROUPID 0 +#define F_BITMASK (1ULL << 5) +#define I_GROUPID 0 +#define I_BITMASK (1ULL << 8) +#define M_GROUPID 0 +#define M_BITMASK (1ULL << 12) +#define V_GROUPID 0 +#define V_BITMASK (1ULL << 21) +#define ZACAS_GROUPID 0 +#define ZACAS_BITMASK (1ULL << 26) +#define ZBA_GROUPID 0 +#define ZBA_BITMASK (1ULL << 27) +#define ZBB_GROUPID 0 +#define ZBB_BITMASK (1ULL << 28) +#define ZBC_GROUPID 0 +#define ZBC_BITMASK (1ULL << 29) +#define ZBKB_GROUPID 0 +#define ZBKB_BITMASK (1ULL << 30) +#define ZBKC_GROUPID 0 +#define ZBKC_BITMASK (1ULL << 31) +#define ZBKX_GROUPID 0 +#define ZBKX_BITMASK (1ULL << 32) +#define ZBS_GROUPID 0 +#define ZBS_BITMASK (1ULL << 33) +#define ZFA_GROUPID 0 +#define ZFA_BITMASK (1ULL << 34) +#define ZFH_GROUPID 0 +#define ZFH_BITMASK (1ULL << 35) +#define ZFHMIN_GROUPID 0 +#define ZFHMIN_BITMASK (1ULL << 36) +#define ZICBOZ_GROUPID 0 +#define ZICBOZ_BITMASK (1ULL << 37) +#define ZICOND_GROUPID 0 +#define ZICOND_BITMASK (1ULL << 38) +#define ZIHINTNTL_GROUPID 0 +#define ZIHINTNTL_BITMASK (1ULL << 39) +#define ZIHINTPAUSE_GROUPID 0 +#define ZIHINTPAUSE_BITMASK (1ULL << 40) +#define ZKND_GROUPID 0 +#define ZKND_BITMASK (1ULL << 41) +#define ZKNE_GROUPID 0 +#define ZKNE_BITMASK (1ULL << 42) +#define ZKNH_GROUPID 0 +#define ZKNH_BITMASK (1ULL << 43) +#define ZKSED_GROUPID 0 +#define ZKSED_BITMASK (1ULL << 44) +#define ZKSH_GROUPID 0 +#define ZKSH_BITMASK (1ULL << 45) +#define ZKT_GROUPID 0 +#define ZKT_BITMASK (1ULL << 46) +#define ZTSO_GROUPID 0 +#define ZTSO_BITMASK (1ULL << 47) +#define ZVBB_GROUPID 0 +#define ZVBB_BITMASK (1ULL << 48) +#define ZVBC_GROUPID 0 +#define ZVBC_BITMASK (1ULL << 49) +#define ZVFH_GROUPID 0 +#define ZVFH_BITMASK (1ULL << 50) +#define ZVFHMIN_GROUPID 0 +#define ZVFHMIN_BITMASK (1ULL << 51) +#define ZVKB_GROUPID 0 +#define ZVKB_BITMASK (1ULL << 52) +#define ZVKG_GROUPID 0 +#define ZVKG_BITMASK (1ULL << 53) +#define ZVKNED_GROUPID 0 +#define ZVKNED_BITMASK (1ULL << 54) +#define ZVKNHA_GROUPID 0 +#define ZVKNHA_BITMASK (1ULL << 55) +#define ZVKNHB_GROUPID 0 +#define ZVKNHB_BITMASK (1ULL << 56) +#define ZVKSED_GROUPID 0 +#define ZVKSED_BITMASK (1ULL << 57) +#define ZVKSH_GROUPID 0 +#define ZVKSH_BITMASK (1ULL << 58) +#define ZVKT_GROUPID 0 +#define ZVKT_BITMASK (1ULL << 59) +#define ZVE32X_GROUPID 0 +#define ZVE32X_BITMASK (1ULL << 60) +#define ZVE32F_GROUPID 0 +#define ZVE32F_BITMASK (1ULL << 61) +#define ZVE64X_GROUPID 0 +#define ZVE64X_BITMASK (1ULL << 62) +#define ZVE64F_GROUPID 0 +#define ZVE64F_BITMASK (1ULL << 63) +#define ZVE64D_GROUPID 1 
+#define ZVE64D_BITMASK (1ULL << 0)
+#define ZIMOP_GROUPID 1
+#define ZIMOP_BITMASK (1ULL << 1)
+#define ZCA_GROUPID 1
+#define ZCA_BITMASK (1ULL << 2)
+#define ZCB_GROUPID 1
+#define ZCB_BITMASK (1ULL << 3)
+#define ZCD_GROUPID 1
+#define ZCD_BITMASK (1ULL << 4)
+#define ZCF_GROUPID 1
+#define ZCF_BITMASK (1ULL << 5)
+#define ZCMOP_GROUPID 1
+#define ZCMOP_BITMASK (1ULL << 6)
+#define ZAWRS_GROUPID 1
+#define ZAWRS_BITMASK (1ULL << 7)
+
+#if defined(__linux__)
+
+// The RISC-V hwprobe interface is documented here:
+// <https://docs.kernel.org/arch/riscv/hwprobe.html>.
+
+static long syscall_impl_5_args(long number, long arg1, long arg2, long arg3,
+                                long arg4, long arg5) {
+  register long a7 __asm__("a7") = number;
+  register long a0 __asm__("a0") = arg1;
+  register long a1 __asm__("a1") = arg2;
+  register long a2 __asm__("a2") = arg3;
+  register long a3 __asm__("a3") = arg4;
+  register long a4 __asm__("a4") = arg5;
+  __asm__ __volatile__("ecall\n\t"
+                       : "=r"(a0)
+                       : "r"(a7), "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4)
+                       : "memory");
+  return a0;
+}
+
+#define RISCV_HWPROBE_KEY_MVENDORID 0
+#define RISCV_HWPROBE_KEY_MARCHID 1
+#define RISCV_HWPROBE_KEY_MIMPID 2
+#define RISCV_HWPROBE_KEY_BASE_BEHAVIOR 3
+#define RISCV_HWPROBE_BASE_BEHAVIOR_IMA (1ULL << 0)
+#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+#define RISCV_HWPROBE_IMA_FD (1ULL << 0)
+#define RISCV_HWPROBE_IMA_C (1ULL << 1)
+#define RISCV_HWPROBE_IMA_V (1ULL << 2)
+#define RISCV_HWPROBE_EXT_ZBA (1ULL << 3)
+#define RISCV_HWPROBE_EXT_ZBB (1ULL << 4)
+#define RISCV_HWPROBE_EXT_ZBS (1ULL << 5)
+#define RISCV_HWPROBE_EXT_ZICBOZ (1ULL << 6)
+#define RISCV_HWPROBE_EXT_ZBC (1ULL << 7)
+#define RISCV_HWPROBE_EXT_ZBKB (1ULL << 8)
+#define RISCV_HWPROBE_EXT_ZBKC (1ULL << 9)
+#define RISCV_HWPROBE_EXT_ZBKX (1ULL << 10)
+#define RISCV_HWPROBE_EXT_ZKND (1ULL << 11)
+#define RISCV_HWPROBE_EXT_ZKNE (1ULL << 12)
+#define RISCV_HWPROBE_EXT_ZKNH (1ULL << 13)
+#define RISCV_HWPROBE_EXT_ZKSED (1ULL << 14)
+#define RISCV_HWPROBE_EXT_ZKSH (1ULL << 15)
+#define RISCV_HWPROBE_EXT_ZKT (1ULL << 16)
+#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17)
+#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18)
+#define RISCV_HWPROBE_EXT_ZVKB (1ULL << 19)
+#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20)
+#define RISCV_HWPROBE_EXT_ZVKNED (1ULL << 21)
+#define RISCV_HWPROBE_EXT_ZVKNHA (1ULL << 22)
+#define RISCV_HWPROBE_EXT_ZVKNHB (1ULL << 23)
+#define RISCV_HWPROBE_EXT_ZVKSED (1ULL << 24)
+#define RISCV_HWPROBE_EXT_ZVKSH (1ULL << 25)
+#define RISCV_HWPROBE_EXT_ZVKT (1ULL << 26)
+#define RISCV_HWPROBE_EXT_ZFH (1ULL << 27)
+#define RISCV_HWPROBE_EXT_ZFHMIN (1ULL << 28)
+#define RISCV_HWPROBE_EXT_ZIHINTNTL (1ULL << 29)
+#define RISCV_HWPROBE_EXT_ZVFH (1ULL << 30)
+#define RISCV_HWPROBE_EXT_ZVFHMIN (1ULL << 31)
+#define RISCV_HWPROBE_EXT_ZFA (1ULL << 32)
+#define RISCV_HWPROBE_EXT_ZTSO (1ULL << 33)
+#define RISCV_HWPROBE_EXT_ZACAS (1ULL << 34)
+#define RISCV_HWPROBE_EXT_ZICOND (1ULL << 35)
+#define RISCV_HWPROBE_EXT_ZIHINTPAUSE (1ULL << 36)
+#define RISCV_HWPROBE_EXT_ZVE32X (1ULL << 37)
+#define RISCV_HWPROBE_EXT_ZVE32F (1ULL << 38)
+#define RISCV_HWPROBE_EXT_ZVE64X (1ULL << 39)
+#define RISCV_HWPROBE_EXT_ZVE64F (1ULL << 40)
+#define RISCV_HWPROBE_EXT_ZVE64D (1ULL << 41)
+#define RISCV_HWPROBE_EXT_ZIMOP (1ULL << 42)
+#define RISCV_HWPROBE_EXT_ZCA (1ULL << 43)
+#define RISCV_HWPROBE_EXT_ZCB (1ULL << 44)
+#define RISCV_HWPROBE_EXT_ZCD (1ULL << 45)
+#define RISCV_HWPROBE_EXT_ZCF (1ULL << 46)
+#define RISCV_HWPROBE_EXT_ZCMOP (1ULL << 47)
+#define RISCV_HWPROBE_EXT_ZAWRS (1ULL << 48)
+#define RISCV_HWPROBE_KEY_CPUPERF_0 5
+#define
RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) +#define RISCV_HWPROBE_MISALIGNED_EMULATED (1ULL << 0) +#define RISCV_HWPROBE_MISALIGNED_SLOW (2 << 0) +#define RISCV_HWPROBE_MISALIGNED_FAST (3 << 0) +#define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) +#define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) +#define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE 6 +/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ + +struct riscv_hwprobe { + long long key; + unsigned long long value; +}; + +#define __NR_riscv_hwprobe 258 +static long initHwProbe(struct riscv_hwprobe *Hwprobes, int len) { + return syscall_impl_5_args(__NR_riscv_hwprobe, (long)Hwprobes, len, 0, 0, 0); +} + +#define SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(EXTNAME) \ + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_EXT_##EXTNAME, EXTNAME) + +#define SET_SINGLE_IMAEXT_RISCV_FEATURE(HWPROBE_BITMASK, EXT) \ + SET_SINGLE_RISCV_FEATURE(IMAEXT0Value &HWPROBE_BITMASK, EXT) + +#define SET_SINGLE_RISCV_FEATURE(COND, EXT) \ + if (COND) { \ + SET_RISCV_FEATURE(EXT); \ + } + +#define SET_RISCV_FEATURE(EXT) features[EXT##_GROUPID] |= EXT##_BITMASK + +static void initRISCVFeature(struct riscv_hwprobe Hwprobes[]) { + + // Note: If a hwprobe key is unknown to the kernel, its key field + // will be cleared to -1, and its value set to 0. + // This unsets all extension bitmask bits. + + // Init VendorID, ArchID, ImplID + __riscv_cpu_model.mVendorID = Hwprobes[2].value; + __riscv_cpu_model.mArchID = Hwprobes[3].value; + __riscv_cpu_model.mImplID = Hwprobes[4].value; + + // Init standard extension + // TODO: Maybe Extension implied generate from tablegen? + + unsigned long long features[RISCV_FEATURE_BITS_LENGTH]; + int i; + + for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++) + features[i] = 0; + + // Check RISCV_HWPROBE_KEY_BASE_BEHAVIOR + unsigned long long BaseValue = Hwprobes[0].value; + if (BaseValue & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) { + SET_RISCV_FEATURE(I); + SET_RISCV_FEATURE(M); + SET_RISCV_FEATURE(A); + } + + // Check RISCV_HWPROBE_KEY_IMA_EXT_0 + unsigned long long IMAEXT0Value = Hwprobes[1].value; + if (IMAEXT0Value & RISCV_HWPROBE_IMA_FD) { + SET_RISCV_FEATURE(F); + SET_RISCV_FEATURE(D); + } + + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_C, C); + SET_SINGLE_IMAEXT_RISCV_FEATURE(RISCV_HWPROBE_IMA_V, V); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBS); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICBOZ); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZBKX); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKND); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNE); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKNH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKSH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZKT); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVBC); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKG); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKNHB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSED); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKSH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVKT); + 
SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFHMIN); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTNTL); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIHINTPAUSE); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFH); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVFHMIN); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZFA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZTSO); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZACAS); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZICOND); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE32X); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE32F); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64X); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64F); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZVE64D); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZIMOP); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCA); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCB); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCD); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCF); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZCMOP); + SET_RISCV_HWPROBE_EXT_SINGLE_RISCV_FEATURE(ZAWRS); + + for (i = 0; i < RISCV_FEATURE_BITS_LENGTH; i++) + __riscv_feature_bits.features[i] = features[i]; +} + +#endif // defined(__linux__) + +static int FeaturesBitCached = 0; + +void __init_riscv_feature_bits(void *) CONSTRUCTOR_ATTRIBUTE; + +// A constructor function that sets __riscv_feature_bits, and +// __riscv_vendor_feature_bits to the right values. This needs to run +// only once. This constructor is given the highest priority and it should +// run before constructors without the priority set. However, it still runs +// after ifunc initializers and needs to be called explicitly there. + +// PlatformArgs allows the platform to provide pre-computed data and access it +// without extra effort. For example, Linux could pass the vDSO object to avoid +// an extra system call. +void CONSTRUCTOR_ATTRIBUTE __init_riscv_feature_bits(void *PlatformArgs) { + + if (FeaturesBitCached) + return; + + __riscv_feature_bits.length = RISCV_FEATURE_BITS_LENGTH; + __riscv_vendor_feature_bits.length = RISCV_VENDOR_FEATURE_BITS_LENGTH; + +#if defined(__linux__) + struct riscv_hwprobe Hwprobes[] = { + {RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0}, {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}, + {RISCV_HWPROBE_KEY_MVENDORID, 0}, {RISCV_HWPROBE_KEY_MARCHID, 0}, + {RISCV_HWPROBE_KEY_MIMPID, 0}, + }; + if (initHwProbe(Hwprobes, sizeof(Hwprobes) / sizeof(Hwprobes[0]))) + return; + + initRISCVFeature(Hwprobes); +#endif // defined(__linux__) + + FeaturesBitCached = 1; +} diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c new file mode 100644 index 0000000..069defc --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -0,0 +1,1139 @@ +//===-- cpu_model/x86.c - Support for __cpu_model builtin --------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is based on LLVM's lib/Support/Host.cpp. +// It implements the operating system Host concept and builtin +// __cpu_model for the compiler_rt library for x86. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "cpu_model.h"
+
+#if !(defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||         \
+      defined(_M_X64))
+#error This file is intended only for x86-based targets
+#endif
+
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
+
+#include <assert.h>
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER)
+#include <cpuid.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+enum VendorSignatures {
+  SIG_INTEL = 0x756e6547, // Genu
+  SIG_AMD = 0x68747541,   // Auth
+};
+
+enum ProcessorVendors {
+  VENDOR_INTEL = 1,
+  VENDOR_AMD,
+  VENDOR_OTHER,
+  VENDOR_MAX
+};
+
+enum ProcessorTypes {
+  INTEL_BONNELL = 1,
+  INTEL_CORE2,
+  INTEL_COREI7,
+  AMDFAM10H,
+  AMDFAM15H,
+  INTEL_SILVERMONT,
+  INTEL_KNL,
+  AMD_BTVER1,
+  AMD_BTVER2,
+  AMDFAM17H,
+  INTEL_KNM,
+  INTEL_GOLDMONT,
+  INTEL_GOLDMONT_PLUS,
+  INTEL_TREMONT,
+  AMDFAM19H,
+  ZHAOXIN_FAM7H,
+  INTEL_SIERRAFOREST,
+  INTEL_GRANDRIDGE,
+  INTEL_CLEARWATERFOREST,
+  CPU_TYPE_MAX
+};
+
+enum ProcessorSubtypes {
+  INTEL_COREI7_NEHALEM = 1,
+  INTEL_COREI7_WESTMERE,
+  INTEL_COREI7_SANDYBRIDGE,
+  AMDFAM10H_BARCELONA,
+  AMDFAM10H_SHANGHAI,
+  AMDFAM10H_ISTANBUL,
+  AMDFAM15H_BDVER1,
+  AMDFAM15H_BDVER2,
+  AMDFAM15H_BDVER3,
+  AMDFAM15H_BDVER4,
+  AMDFAM17H_ZNVER1,
+  INTEL_COREI7_IVYBRIDGE,
+  INTEL_COREI7_HASWELL,
+  INTEL_COREI7_BROADWELL,
+  INTEL_COREI7_SKYLAKE,
+  INTEL_COREI7_SKYLAKE_AVX512,
+  INTEL_COREI7_CANNONLAKE,
+  INTEL_COREI7_ICELAKE_CLIENT,
+  INTEL_COREI7_ICELAKE_SERVER,
+  AMDFAM17H_ZNVER2,
+  INTEL_COREI7_CASCADELAKE,
+  INTEL_COREI7_TIGERLAKE,
+  INTEL_COREI7_COOPERLAKE,
+  INTEL_COREI7_SAPPHIRERAPIDS,
+  INTEL_COREI7_ALDERLAKE,
+  AMDFAM19H_ZNVER3,
+  INTEL_COREI7_ROCKETLAKE,
+  ZHAOXIN_FAM7H_LUJIAZUI,
+  AMDFAM19H_ZNVER4,
+  INTEL_COREI7_GRANITERAPIDS,
+  INTEL_COREI7_GRANITERAPIDS_D,
+  INTEL_COREI7_ARROWLAKE,
+  INTEL_COREI7_ARROWLAKE_S,
+  INTEL_COREI7_PANTHERLAKE,
+  CPU_SUBTYPE_MAX
+};
+
+enum ProcessorFeatures {
+  FEATURE_CMOV = 0,
+  FEATURE_MMX,
+  FEATURE_POPCNT,
+  FEATURE_SSE,
+  FEATURE_SSE2,
+  FEATURE_SSE3,
+  FEATURE_SSSE3,
+  FEATURE_SSE4_1,
+  FEATURE_SSE4_2,
+  FEATURE_AVX,
+  FEATURE_AVX2,
+  FEATURE_SSE4_A,
+  FEATURE_FMA4,
+  FEATURE_XOP,
+  FEATURE_FMA,
+  FEATURE_AVX512F,
+  FEATURE_BMI,
+  FEATURE_BMI2,
+  FEATURE_AES,
+  FEATURE_PCLMUL,
+  FEATURE_AVX512VL,
+  FEATURE_AVX512BW,
+  FEATURE_AVX512DQ,
+  FEATURE_AVX512CD,
+  FEATURE_AVX512ER,
+  FEATURE_AVX512PF,
+  FEATURE_AVX512VBMI,
+  FEATURE_AVX512IFMA,
+  FEATURE_AVX5124VNNIW,
+  FEATURE_AVX5124FMAPS,
+  FEATURE_AVX512VPOPCNTDQ,
+  FEATURE_AVX512VBMI2,
+  FEATURE_GFNI,
+  FEATURE_VPCLMULQDQ,
+  FEATURE_AVX512VNNI,
+  FEATURE_AVX512BITALG,
+  FEATURE_AVX512BF16,
+  FEATURE_AVX512VP2INTERSECT,
+  // FIXME: The features below are incomplete compared to gcc, because some
+  // of gcc's features do not map one-to-one onto llvm's.
+  // FEATURE_3DNOW,
+  // FEATURE_3DNOWP,
+  FEATURE_ADX = 40,
+  // FEATURE_ABM,
+  FEATURE_CLDEMOTE = 42,
+  FEATURE_CLFLUSHOPT,
+  FEATURE_CLWB,
+  FEATURE_CLZERO,
+  FEATURE_CMPXCHG16B,
+  // FIXME: Not adding FEATURE_CMPXCHG8B is a workaround to make 'generic' a
+  // cpu string with no X86_FEATURE_COMPAT features, which is required in the
+  // current implementation of the cpu_specific/cpu_dispatch FMV feature.
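+  // (The explicit "= N" initializers below keep each remaining feature at
+  // the same bit position as in gcc's list even across the commented-out
+  // gaps; e.g. FEATURE_ENQCMD stays pinned to bit 48 and FEATURE_WIDEKL to
+  // bit 92.)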
+ // FEATURE_CMPXCHG8B, + FEATURE_ENQCMD = 48, + FEATURE_F16C, + FEATURE_FSGSBASE, + // FEATURE_FXSAVE, + // FEATURE_HLE, + // FEATURE_IBT, + FEATURE_LAHF_LM = 54, + FEATURE_LM, + FEATURE_LWP, + FEATURE_LZCNT, + FEATURE_MOVBE, + FEATURE_MOVDIR64B, + FEATURE_MOVDIRI, + FEATURE_MWAITX, + // FEATURE_OSXSAVE, + FEATURE_PCONFIG = 63, + FEATURE_PKU, + FEATURE_PREFETCHWT1, + FEATURE_PRFCHW, + FEATURE_PTWRITE, + FEATURE_RDPID, + FEATURE_RDRND, + FEATURE_RDSEED, + FEATURE_RTM, + FEATURE_SERIALIZE, + FEATURE_SGX, + FEATURE_SHA, + FEATURE_SHSTK, + FEATURE_TBM, + FEATURE_TSXLDTRK, + FEATURE_VAES, + FEATURE_WAITPKG, + FEATURE_WBNOINVD, + FEATURE_XSAVE, + FEATURE_XSAVEC, + FEATURE_XSAVEOPT, + FEATURE_XSAVES, + FEATURE_AMX_TILE, + FEATURE_AMX_INT8, + FEATURE_AMX_BF16, + FEATURE_UINTR, + FEATURE_HRESET, + FEATURE_KL, + // FEATURE_AESKLE, + FEATURE_WIDEKL = 92, + FEATURE_AVXVNNI, + FEATURE_AVX512FP16, + FEATURE_X86_64_BASELINE, + FEATURE_X86_64_V2, + FEATURE_X86_64_V3, + FEATURE_X86_64_V4, + FEATURE_AVXIFMA, + FEATURE_AVXVNNIINT8, + FEATURE_AVXNECONVERT, + FEATURE_CMPCCXADD, + FEATURE_AMX_FP16, + FEATURE_PREFETCHI, + FEATURE_RAOINT, + FEATURE_AMX_COMPLEX, + FEATURE_AVXVNNIINT16, + FEATURE_SM3, + FEATURE_SHA512, + FEATURE_SM4, + FEATURE_APXF, + FEATURE_USERMSR, + FEATURE_AVX10_1_256, + FEATURE_AVX10_1_512, + FEATURE_AVX10_2_256, + FEATURE_AVX10_2_512, + CPU_FEATURE_MAX +}; + +// This code is copied from lib/Support/Host.cpp. +// Changes to either file should be mirrored in the other. + +/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in +/// the specified arguments. If we can't run cpuid on the host, return true. +static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, + unsigned *rECX, unsigned *rEDX) { +#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) + return !__get_cpuid(value, rEAX, rEBX, rECX, rEDX); +#elif defined(_MSC_VER) + // The MSVC intrinsic is portable across x86 and x64. + int registers[4]; + __cpuid(registers, value); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; +#else + return true; +#endif +} + +/// getX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return +/// the 4 values in the specified arguments. If we can't run cpuid on the host, +/// return true. +static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, + unsigned *rEAX, unsigned *rEBX, unsigned *rECX, + unsigned *rEDX) { + // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang + // are such that __cpuidex is defined within cpuid.h for both, we can remove + // the __get_cpuid_count function and share the MSVC implementation between + // all three. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) + return !__get_cpuid_count(value, subleaf, rEAX, rEBX, rECX, rEDX); +#elif defined(_MSC_VER) + int registers[4]; + __cpuidex(registers, value, subleaf); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; +#else + return true; +#endif +} + +// Read control register 0 (XCR0). Used to detect features such as AVX. +static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) { + // TODO(boomanaiden154): When the minimum toolchain versions for gcc and clang + // are such that _xgetbv is supported by both, we can unify the implementation + // with MSVC and remove all inline assembly. 
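+  // XCR0's low bits report which register state the OS saves on context
+  // switch: bit 1 (SSE/XMM) and bit 2 (AVX/YMM) form the 0x6 mask that
+  // callers test, and bits 5-7 (0xe0) cover the AVX-512 opmask/ZMM state.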
+#if defined(__GNUC__) || defined(__clang__) + // Check xgetbv; this uses a .byte sequence instead of the instruction + // directly because older assemblers do not include support for xgetbv and + // there is no easy way to conditionally compile based on the assembler used. + __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(*rEAX), "=d"(*rEDX) : "c"(0)); + return false; +#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + unsigned long long Result = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); + *rEAX = Result; + *rEDX = Result >> 32; + return false; +#else + return true; +#endif +} + +static void detectX86FamilyModel(unsigned EAX, unsigned *Family, + unsigned *Model) { + *Family = (EAX >> 8) & 0xf; // Bits 8 - 11 + *Model = (EAX >> 4) & 0xf; // Bits 4 - 7 + if (*Family == 6 || *Family == 0xf) { + if (*Family == 0xf) + // Examine extended family ID if family ID is F. + *Family += (EAX >> 20) & 0xff; // Bits 20 - 27 + // Examine extended model ID if family ID is 6 or F. + *Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 + } +} + +#define testFeature(F) (Features[F / 32] & (1 << (F % 32))) != 0 + +static const char *getIntelProcessorTypeAndSubtype(unsigned Family, + unsigned Model, + const unsigned *Features, + unsigned *Type, + unsigned *Subtype) { + // We select CPU strings to match the code in Host.cpp, but we don't use them + // in compiler-rt. + const char *CPU = 0; + + switch (Family) { + case 6: + switch (Model) { + case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile + // processor, Intel Core 2 Quad processor, Intel Core 2 Quad + // mobile processor, Intel Core 2 Extreme processor, Intel + // Pentium Dual-Core processor, Intel Xeon processor, model + // 0Fh. All processors are manufactured using the 65 nm process. + case 0x16: // Intel Celeron processor model 16h. All processors are + // manufactured using the 65 nm process + CPU = "core2"; + *Type = INTEL_CORE2; + break; + case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model + // 17h. All processors are manufactured using the 45 nm process. + // + // 45nm: Penryn , Wolfdale, Yorkfield (XE) + case 0x1d: // Intel Xeon processor MP. All processors are manufactured using + // the 45 nm process. + CPU = "penryn"; + *Type = INTEL_CORE2; + break; + case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 45 nm process. + case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. + // As found in a Summer 2010 model iMac. + case 0x1f: + case 0x2e: // Nehalem EX + CPU = "nehalem"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_NEHALEM; + break; + case 0x25: // Intel Core i7, laptop version. + case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All + // processors are manufactured using the 32 nm process. + case 0x2f: // Westmere EX + CPU = "westmere"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_WESTMERE; + break; + case 0x2a: // Intel Core i7 processor. All processors are manufactured + // using the 32 nm process. 
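+    // Sandy Bridge-E/EP server parts: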
+ case 0x2d: + CPU = "sandybridge"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SANDYBRIDGE; + break; + case 0x3a: + case 0x3e: // Ivy Bridge EP + CPU = "ivybridge"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_IVYBRIDGE; + break; + + // Haswell: + case 0x3c: + case 0x3f: + case 0x45: + case 0x46: + CPU = "haswell"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_HASWELL; + break; + + // Broadwell: + case 0x3d: + case 0x47: + case 0x4f: + case 0x56: + CPU = "broadwell"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_BROADWELL; + break; + + // Skylake: + case 0x4e: // Skylake mobile + case 0x5e: // Skylake desktop + case 0x8e: // Kaby Lake mobile + case 0x9e: // Kaby Lake desktop + case 0xa5: // Comet Lake-H/S + case 0xa6: // Comet Lake-U + CPU = "skylake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SKYLAKE; + break; + + // Rocketlake: + case 0xa7: + CPU = "rocketlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ROCKETLAKE; + break; + + // Skylake Xeon: + case 0x55: + *Type = INTEL_COREI7; + if (testFeature(FEATURE_AVX512BF16)) { + CPU = "cooperlake"; + *Subtype = INTEL_COREI7_COOPERLAKE; + } else if (testFeature(FEATURE_AVX512VNNI)) { + CPU = "cascadelake"; + *Subtype = INTEL_COREI7_CASCADELAKE; + } else { + CPU = "skylake-avx512"; + *Subtype = INTEL_COREI7_SKYLAKE_AVX512; + } + break; + + // Cannonlake: + case 0x66: + CPU = "cannonlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_CANNONLAKE; + break; + + // Icelake: + case 0x7d: + case 0x7e: + CPU = "icelake-client"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ICELAKE_CLIENT; + break; + + // Tigerlake: + case 0x8c: + case 0x8d: + CPU = "tigerlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_TIGERLAKE; + break; + + // Alderlake: + case 0x97: + case 0x9a: + // Raptorlake: + case 0xb7: + case 0xba: + case 0xbf: + // Meteorlake: + case 0xaa: + case 0xac: + // Gracemont: + case 0xbe: + CPU = "alderlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ALDERLAKE; + break; + + // Arrowlake: + case 0xc5: + CPU = "arrowlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ARROWLAKE; + break; + + // Arrowlake S: + case 0xc6: + // Lunarlake: + case 0xbd: + CPU = "arrowlake-s"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ARROWLAKE_S; + break; + + // Pantherlake: + case 0xcc: + CPU = "pantherlake"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_PANTHERLAKE; + break; + + // Icelake Xeon: + case 0x6a: + case 0x6c: + CPU = "icelake-server"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_ICELAKE_SERVER; + break; + + // Emerald Rapids: + case 0xcf: + // Sapphire Rapids: + case 0x8f: + CPU = "sapphirerapids"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_SAPPHIRERAPIDS; + break; + + // Granite Rapids: + case 0xad: + CPU = "graniterapids"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_GRANITERAPIDS; + break; + + // Granite Rapids D: + case 0xae: + CPU = "graniterapids-d"; + *Type = INTEL_COREI7; + *Subtype = INTEL_COREI7_GRANITERAPIDS_D; + break; + + case 0x1c: // Most 45 nm Intel Atom processors + case 0x26: // 45 nm Atom Lincroft + case 0x27: // 32 nm Atom Medfield + case 0x35: // 32 nm Atom Midview + case 0x36: // 32 nm Atom Midview + CPU = "bonnell"; + *Type = INTEL_BONNELL; + break; + + // Atom Silvermont codes from the Intel software optimization guide. 
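+    // (Per public Intel documentation, model 0x37 is Bay Trail and 0x4d is
+    // Avoton.)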
+ case 0x37: + case 0x4a: + case 0x4d: + case 0x5a: + case 0x5d: + case 0x4c: // really airmont + CPU = "silvermont"; + *Type = INTEL_SILVERMONT; + break; + // Goldmont: + case 0x5c: // Apollo Lake + case 0x5f: // Denverton + CPU = "goldmont"; + *Type = INTEL_GOLDMONT; + break; // "goldmont" + case 0x7a: + CPU = "goldmont-plus"; + *Type = INTEL_GOLDMONT_PLUS; + break; + case 0x86: + case 0x8a: // Lakefield + case 0x96: // Elkhart Lake + case 0x9c: // Jasper Lake + CPU = "tremont"; + *Type = INTEL_TREMONT; + break; + + // Sierraforest: + case 0xaf: + CPU = "sierraforest"; + *Type = INTEL_SIERRAFOREST; + break; + + // Grandridge: + case 0xb6: + CPU = "grandridge"; + *Type = INTEL_GRANDRIDGE; + break; + + // Clearwaterforest: + case 0xdd: + CPU = "clearwaterforest"; + *Type = INTEL_COREI7; + *Subtype = INTEL_CLEARWATERFOREST; + break; + + case 0x57: + CPU = "knl"; + *Type = INTEL_KNL; + break; + + case 0x85: + CPU = "knm"; + *Type = INTEL_KNM; + break; + + default: // Unknown family 6 CPU. + break; + } + break; + default: + break; // Unknown. + } + + return CPU; +} + +static const char *getAMDProcessorTypeAndSubtype(unsigned Family, + unsigned Model, + const unsigned *Features, + unsigned *Type, + unsigned *Subtype) { + const char *CPU = 0; + + switch (Family) { + case 4: + CPU = "i486"; + break; + case 5: + CPU = "pentium"; + switch (Model) { + case 6: + case 7: + CPU = "k6"; + break; + case 8: + CPU = "k6-2"; + break; + case 9: + case 13: + CPU = "k6-3"; + break; + case 10: + CPU = "geode"; + break; + } + break; + case 6: + if (testFeature(FEATURE_SSE)) { + CPU = "athlon-xp"; + break; + } + CPU = "athlon"; + break; + case 15: + if (testFeature(FEATURE_SSE3)) { + CPU = "k8-sse3"; + break; + } + CPU = "k8"; + break; + case 16: + CPU = "amdfam10"; + *Type = AMDFAM10H; // "amdfam10" + switch (Model) { + case 2: + *Subtype = AMDFAM10H_BARCELONA; + break; + case 4: + *Subtype = AMDFAM10H_SHANGHAI; + break; + case 8: + *Subtype = AMDFAM10H_ISTANBUL; + break; + } + break; + case 20: + CPU = "btver1"; + *Type = AMD_BTVER1; + break; + case 21: + CPU = "bdver1"; + *Type = AMDFAM15H; + if (Model >= 0x60 && Model <= 0x7f) { + CPU = "bdver4"; + *Subtype = AMDFAM15H_BDVER4; + break; // 60h-7Fh: Excavator + } + if (Model >= 0x30 && Model <= 0x3f) { + CPU = "bdver3"; + *Subtype = AMDFAM15H_BDVER3; + break; // 30h-3Fh: Steamroller + } + if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) { + CPU = "bdver2"; + *Subtype = AMDFAM15H_BDVER2; + break; // 02h, 10h-1Fh: Piledriver + } + if (Model <= 0x0f) { + *Subtype = AMDFAM15H_BDVER1; + break; // 00h-0Fh: Bulldozer + } + break; + case 22: + CPU = "btver2"; + *Type = AMD_BTVER2; + break; + case 23: + CPU = "znver1"; + *Type = AMDFAM17H; + if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) || + (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) || + (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 17h Models 30h-3Fh (Starship) Zen 2 + // Family 17h Models 47h (Cardinal) Zen 2 + // Family 17h Models 60h-67h (Renoir) Zen 2 + // Family 17h Models 68h-6Fh (Lucienne) Zen 2 + // Family 17h Models 70h-7Fh (Matisse) Zen 2 + // Family 17h Models 84h-87h (ProjectX) Zen 2 + // Family 17h Models 90h-97h (VanGogh) Zen 2 + // Family 17h Models 98h-9Fh (Mero) Zen 2 + // Family 17h Models A0h-AFh (Mendocino) Zen 2 + CPU = "znver2"; + *Subtype = AMDFAM17H_ZNVER2; + break; + } + if ((Model >= 0x10 && Model <= 
0x1f) || (Model >= 0x20 && Model <= 0x2f)) { + // Family 17h Models 10h-1Fh (Raven1) Zen + // Family 17h Models 10h-1Fh (Picasso) Zen+ + // Family 17h Models 20h-2Fh (Raven2 x86) Zen + *Subtype = AMDFAM17H_ZNVER1; + break; + } + break; + case 25: + CPU = "znver3"; + *Type = AMDFAM19H; + if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x2f) || + (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) || + (Model >= 0x50 && Model <= 0x5f)) { + // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3 + // Family 19h Models 20h-2Fh (Vermeer) Zen 3 + // Family 19h Models 30h-3Fh (Badami) Zen 3 + // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+ + // Family 19h Models 50h-5Fh (Cezanne) Zen 3 + *Subtype = AMDFAM19H_ZNVER3; + break; + } + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4 + // Family 19h Models 60h-6Fh (Raphael) Zen 4 + // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4 + // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4 + // Family 19h Models A0h-AFh (Stones-Dense) Zen 4 + CPU = "znver4"; + *Subtype = AMDFAM19H_ZNVER4; + break; // "znver4" + } + break; // family 19h + default: + break; // Unknown AMD CPU. + } + + return CPU; +} + +#undef testFeature + +static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, + unsigned *Features) { + unsigned EAX = 0, EBX = 0; + +#define hasFeature(F) ((Features[F / 32] >> (F % 32)) & 1) +#define setFeature(F) Features[F / 32] |= 1U << (F % 32) + + if ((EDX >> 15) & 1) + setFeature(FEATURE_CMOV); + if ((EDX >> 23) & 1) + setFeature(FEATURE_MMX); + if ((EDX >> 25) & 1) + setFeature(FEATURE_SSE); + if ((EDX >> 26) & 1) + setFeature(FEATURE_SSE2); + + if ((ECX >> 0) & 1) + setFeature(FEATURE_SSE3); + if ((ECX >> 1) & 1) + setFeature(FEATURE_PCLMUL); + if ((ECX >> 9) & 1) + setFeature(FEATURE_SSSE3); + if ((ECX >> 12) & 1) + setFeature(FEATURE_FMA); + if ((ECX >> 13) & 1) + setFeature(FEATURE_CMPXCHG16B); + if ((ECX >> 19) & 1) + setFeature(FEATURE_SSE4_1); + if ((ECX >> 20) & 1) + setFeature(FEATURE_SSE4_2); + if ((ECX >> 22) & 1) + setFeature(FEATURE_MOVBE); + if ((ECX >> 23) & 1) + setFeature(FEATURE_POPCNT); + if ((ECX >> 25) & 1) + setFeature(FEATURE_AES); + if ((ECX >> 29) & 1) + setFeature(FEATURE_F16C); + if ((ECX >> 30) & 1) + setFeature(FEATURE_RDRND); + + // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV + // indicates that the AVX registers will be saved and restored on context + // switch, then we have full AVX support. + const unsigned AVXBits = (1 << 27) | (1 << 28); + bool HasAVXSave = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) && + ((EAX & 0x6) == 0x6); +#if defined(__APPLE__) + // Darwin lazily saves the AVX512 context on first use: trust that the OS will + // save the AVX512 context if we use AVX512 instructions, even the bit is not + // set right now. + bool HasAVX512Save = true; +#else + // AVX512 requires additional context to be saved by the OS. + bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0); +#endif + // AMX requires additional context to be saved by the OS. 
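+  // XCR0 bit 17 (XTILECFG) and bit 18 (XTILEDATA) must both be enabled
+  // before AMX tile state is saved/restored, hence the combined mask below.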
+ const unsigned AMXBits = (1 << 17) | (1 << 18); + bool HasXSave = ((ECX >> 27) & 1) && !getX86XCR0(&EAX, &EDX); + bool HasAMXSave = HasXSave && ((EAX & AMXBits) == AMXBits); + + if (HasAVXSave) + setFeature(FEATURE_AVX); + + if (((ECX >> 26) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVE); + + bool HasLeaf7 = + MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX); + + if (HasLeaf7 && ((EBX >> 0) & 1)) + setFeature(FEATURE_FSGSBASE); + if (HasLeaf7 && ((EBX >> 2) & 1)) + setFeature(FEATURE_SGX); + if (HasLeaf7 && ((EBX >> 3) & 1)) + setFeature(FEATURE_BMI); + if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVXSave) + setFeature(FEATURE_AVX2); + if (HasLeaf7 && ((EBX >> 8) & 1)) + setFeature(FEATURE_BMI2); + if (HasLeaf7 && ((EBX >> 11) & 1)) + setFeature(FEATURE_RTM); + if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512F); + if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512DQ); + if (HasLeaf7 && ((EBX >> 18) & 1)) + setFeature(FEATURE_RDSEED); + if (HasLeaf7 && ((EBX >> 19) & 1)) + setFeature(FEATURE_ADX); + if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512IFMA); + if (HasLeaf7 && ((EBX >> 24) & 1)) + setFeature(FEATURE_CLWB); + if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512PF); + if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512ER); + if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512CD); + if (HasLeaf7 && ((EBX >> 29) & 1)) + setFeature(FEATURE_SHA); + if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BW); + if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VL); + + if (HasLeaf7 && ((ECX >> 0) & 1)) + setFeature(FEATURE_PREFETCHWT1); + if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VBMI); + if (HasLeaf7 && ((ECX >> 4) & 1)) + setFeature(FEATURE_PKU); + if (HasLeaf7 && ((ECX >> 5) & 1)) + setFeature(FEATURE_WAITPKG); + if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VBMI2); + if (HasLeaf7 && ((ECX >> 7) & 1)) + setFeature(FEATURE_SHSTK); + if (HasLeaf7 && ((ECX >> 8) & 1)) + setFeature(FEATURE_GFNI); + if (HasLeaf7 && ((ECX >> 9) & 1) && HasAVXSave) + setFeature(FEATURE_VAES); + if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVXSave) + setFeature(FEATURE_VPCLMULQDQ); + if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VNNI); + if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BITALG); + if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VPOPCNTDQ); + if (HasLeaf7 && ((ECX >> 22) & 1)) + setFeature(FEATURE_RDPID); + if (HasLeaf7 && ((ECX >> 23) & 1)) + setFeature(FEATURE_KL); + if (HasLeaf7 && ((ECX >> 25) & 1)) + setFeature(FEATURE_CLDEMOTE); + if (HasLeaf7 && ((ECX >> 27) & 1)) + setFeature(FEATURE_MOVDIRI); + if (HasLeaf7 && ((ECX >> 28) & 1)) + setFeature(FEATURE_MOVDIR64B); + if (HasLeaf7 && ((ECX >> 29) & 1)) + setFeature(FEATURE_ENQCMD); + + if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX5124VNNIW); + if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX5124FMAPS); + if (HasLeaf7 && ((EDX >> 5) & 1)) + setFeature(FEATURE_UINTR); + if (HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512VP2INTERSECT); + if (HasLeaf7 && ((EDX >> 14) & 1)) + setFeature(FEATURE_SERIALIZE); + if (HasLeaf7 && ((EDX >> 16) & 1)) + 
setFeature(FEATURE_TSXLDTRK); + if (HasLeaf7 && ((EDX >> 18) & 1)) + setFeature(FEATURE_PCONFIG); + if (HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_BF16); + if (HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512FP16); + if (HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_TILE); + if (HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_INT8); + + // EAX from subleaf 0 is the maximum subleaf supported. Some CPUs don't + // return all 0s for invalid subleaves so check the limit. + bool HasLeaf7Subleaf1 = + HasLeaf7 && EAX >= 1 && + !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7Subleaf1 && ((EAX >> 0) & 1)) + setFeature(FEATURE_SHA512); + if (HasLeaf7Subleaf1 && ((EAX >> 1) & 1)) + setFeature(FEATURE_SM3); + if (HasLeaf7Subleaf1 && ((EAX >> 2) & 1)) + setFeature(FEATURE_SM4); + if (HasLeaf7Subleaf1 && ((EAX >> 3) & 1)) + setFeature(FEATURE_RAOINT); + if (HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNI); + if (HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save) + setFeature(FEATURE_AVX512BF16); + if (HasLeaf7Subleaf1 && ((EAX >> 7) & 1)) + setFeature(FEATURE_CMPCCXADD); + if (HasLeaf7Subleaf1 && ((EAX >> 21) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_FP16); + if (HasLeaf7Subleaf1 && ((EAX >> 22) & 1)) + setFeature(FEATURE_HRESET); + if (HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave) + setFeature(FEATURE_AVXIFMA); + + if (HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNIINT8); + if (HasLeaf7Subleaf1 && ((EDX >> 5) & 1) && HasAVXSave) + setFeature(FEATURE_AVXNECONVERT); + if (HasLeaf7Subleaf1 && ((EDX >> 8) & 1) && HasAMXSave) + setFeature(FEATURE_AMX_COMPLEX); + if (HasLeaf7Subleaf1 && ((EDX >> 10) & 1) && HasAVXSave) + setFeature(FEATURE_AVXVNNIINT16); + if (HasLeaf7Subleaf1 && ((EDX >> 14) & 1)) + setFeature(FEATURE_PREFETCHI); + if (HasLeaf7Subleaf1 && ((EDX >> 15) & 1)) + setFeature(FEATURE_USERMSR); + if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1)) + setFeature(FEATURE_APXF); + + unsigned MaxLevel = 0; + getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX); + bool HasLeafD = MaxLevel >= 0xd && + !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); + if (HasLeafD && ((EAX >> 0) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVEOPT); + if (HasLeafD && ((EAX >> 1) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVEC); + if (HasLeafD && ((EAX >> 3) & 1) && HasAVXSave) + setFeature(FEATURE_XSAVES); + + bool HasLeaf24 = + MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1) && HasLeaf24) { + bool Has512Len = (EBX >> 18) & 1; + int AVX10Ver = EBX & 0xff; + if (AVX10Ver >= 2) { + setFeature(FEATURE_AVX10_2_256); + if (Has512Len) + setFeature(FEATURE_AVX10_2_512); + } + if (AVX10Ver >= 1) { + setFeature(FEATURE_AVX10_1_256); + if (Has512Len) + setFeature(FEATURE_AVX10_1_512); + } + } + + unsigned MaxExtLevel = 0; + getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX); + + bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 && + !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + if (HasExtLeaf1) { + if (ECX & 1) + setFeature(FEATURE_LAHF_LM); + if ((ECX >> 5) & 1) + setFeature(FEATURE_LZCNT); + if (((ECX >> 6) & 1)) + setFeature(FEATURE_SSE4_A); + if (((ECX >> 8) & 1)) + setFeature(FEATURE_PRFCHW); + if (((ECX >> 11) & 1)) + setFeature(FEATURE_XOP); + if (((ECX >> 15) & 1)) + setFeature(FEATURE_LWP); + if (((ECX >> 16) & 1)) + setFeature(FEATURE_FMA4); 
+ if (((ECX >> 21) & 1)) + setFeature(FEATURE_TBM); + if (((ECX >> 29) & 1)) + setFeature(FEATURE_MWAITX); + + if (((EDX >> 29) & 1)) + setFeature(FEATURE_LM); + } + + bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 && + !getX86CpuIDAndInfo(0x80000008, &EAX, &EBX, &ECX, &EDX); + if (HasExtLeaf8 && ((EBX >> 0) & 1)) + setFeature(FEATURE_CLZERO); + if (HasExtLeaf8 && ((EBX >> 9) & 1)) + setFeature(FEATURE_WBNOINVD); + + bool HasLeaf14 = MaxLevel >= 0x14 && + !getX86CpuIDAndInfoEx(0x14, 0x0, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf14 && ((EBX >> 4) & 1)) + setFeature(FEATURE_PTWRITE); + + bool HasLeaf19 = + MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); + if (HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1)) + setFeature(FEATURE_WIDEKL); + + if (hasFeature(FEATURE_LM) && hasFeature(FEATURE_SSE2)) { + setFeature(FEATURE_X86_64_BASELINE); + if (hasFeature(FEATURE_CMPXCHG16B) && hasFeature(FEATURE_POPCNT) && + hasFeature(FEATURE_LAHF_LM) && hasFeature(FEATURE_SSE4_2)) { + setFeature(FEATURE_X86_64_V2); + if (hasFeature(FEATURE_AVX2) && hasFeature(FEATURE_BMI) && + hasFeature(FEATURE_BMI2) && hasFeature(FEATURE_F16C) && + hasFeature(FEATURE_FMA) && hasFeature(FEATURE_LZCNT) && + hasFeature(FEATURE_MOVBE)) { + setFeature(FEATURE_X86_64_V3); + if (hasFeature(FEATURE_AVX512BW) && hasFeature(FEATURE_AVX512CD) && + hasFeature(FEATURE_AVX512DQ) && hasFeature(FEATURE_AVX512VL)) + setFeature(FEATURE_X86_64_V4); + } + } + } + +#undef hasFeature +#undef setFeature +} + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +int __cpu_indicator_init(void) CONSTRUCTOR_ATTRIBUTE; + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +struct __processor_model { + unsigned int __cpu_vendor; + unsigned int __cpu_type; + unsigned int __cpu_subtype; + unsigned int __cpu_features[1]; +} __cpu_model = {0, 0, 0, {0}}; + +#ifndef _WIN32 +__attribute__((visibility("hidden"))) +#endif +unsigned __cpu_features2[(CPU_FEATURE_MAX - 1) / 32]; + +// A constructor function that is sets __cpu_model and __cpu_features2 with +// the right values. This needs to run only once. This constructor is +// given the highest priority and it should run before constructors without +// the priority set. However, it still runs after ifunc initializers and +// needs to be called explicitly there. + +int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + unsigned MaxLeaf = 5; + unsigned Vendor; + unsigned Model, Family; + unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; + static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + + // This function needs to run just once. + if (__cpu_model.__cpu_vendor) + return 0; + + if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1) { + __cpu_model.__cpu_vendor = VENDOR_OTHER; + return -1; + } + + getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX); + detectX86FamilyModel(EAX, &Family, &Model); + + // Find available features. + getAvailableFeatures(ECX, EDX, MaxLeaf, &Features[0]); + + __cpu_model.__cpu_features[0] = Features[0]; + __cpu_features2[0] = Features[1]; + __cpu_features2[1] = Features[2]; + __cpu_features2[2] = Features[3]; + + if (Vendor == SIG_INTEL) { + // Get CPU type. + getIntelProcessorTypeAndSubtype(Family, Model, &Features[0], + &(__cpu_model.__cpu_type), + &(__cpu_model.__cpu_subtype)); + __cpu_model.__cpu_vendor = VENDOR_INTEL; + } else if (Vendor == SIG_AMD) { + // Get CPU type. 
+    getAMDProcessorTypeAndSubtype(Family, Model, &Features[0],
+                                  &(__cpu_model.__cpu_type),
+                                  &(__cpu_model.__cpu_subtype));
+    __cpu_model.__cpu_vendor = VENDOR_AMD;
+  } else
+    __cpu_model.__cpu_vendor = VENDOR_OTHER;
+
+  assert(__cpu_model.__cpu_vendor < VENDOR_MAX);
+  assert(__cpu_model.__cpu_type < CPU_TYPE_MAX);
+  assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX);
+
+  return 0;
+}
+#endif // defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
diff --git a/compiler-rt/lib/builtins/crtbegin.c b/compiler-rt/lib/builtins/crtbegin.c
new file mode 100644
index 0000000..d5f7756
--- /dev/null
+++ b/compiler-rt/lib/builtins/crtbegin.c
@@ -0,0 +1,171 @@
+//===-- crtbegin.c - Start of constructors and destructors ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+#if __has_feature(ptrauth_init_fini)
+#include <ptrauth.h>
+#endif
+
+__attribute__((visibility("hidden"))) void *__dso_handle = &__dso_handle;
+
+#ifdef EH_USE_FRAME_REGISTRY
+__extension__ static void *__EH_FRAME_LIST__[]
+    __attribute__((section(".eh_frame"), aligned(sizeof(void *)))) = {};
+
+extern void __register_frame_info(const void *, void *) __attribute__((weak));
+extern void *__deregister_frame_info(const void *) __attribute__((weak));
+#endif
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+typedef void (*fp)(void);
+
+static fp __CTOR_LIST__[]
+    __attribute__((section(".ctors"), aligned(sizeof(fp)))) = {(fp)-1};
+extern fp __CTOR_LIST_END__[];
+#endif
+
+extern void __cxa_finalize(void *) __attribute__((weak));
+
+static void __attribute__((used)) __do_init(void) {
+  static _Bool __initialized;
+  if (__builtin_expect(__initialized, 0))
+    return;
+  __initialized = 1;
+
+#ifdef EH_USE_FRAME_REGISTRY
+  static struct { void *p[8]; } __object;
+  if (__register_frame_info)
+    __register_frame_info(__EH_FRAME_LIST__, &__object);
+#endif
+#ifndef CRT_HAS_INITFINI_ARRAY
+  const size_t n = __CTOR_LIST_END__ - __CTOR_LIST__ - 1;
+  for (size_t i = n; i >= 1; i--) __CTOR_LIST__[i]();
+#endif
+}
+
+#ifdef CRT_HAS_INITFINI_ARRAY
+#if __has_feature(ptrauth_init_fini)
+// TODO: use __ptrauth-qualified pointers when they are supported on clang side
+#if __has_feature(ptrauth_init_fini_address_discrimination)
+__attribute__((section(".init_array"), used)) static void *__init =
+    ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
+                          ptrauth_blend_discriminator(
+                              &__init, __ptrauth_init_fini_discriminator));
+#else
+__attribute__((section(".init_array"), used)) static void *__init =
+    ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
+                          __ptrauth_init_fini_discriminator);
+#endif
+#else
+__attribute__((section(".init_array"),
+               used)) static void (*__init)(void) = __do_init;
+#endif
+#elif defined(__i386__) || defined(__x86_64__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#elif defined(__riscv)
+__asm__(".pushsection .init,\"ax\",%progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#elif defined(__arm__) || defined(__aarch64__)
+__asm__(".pushsection .init,\"ax\",%progbits\n\t"
+        "bl __do_init\n\t"
+        ".popsection");
+#elif defined(__mips__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "jal __do_init\n\t"
+        ".popsection");
+#elif
defined(__powerpc__) || defined(__powerpc64__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "bl __do_init\n\t"
+        "nop\n\t"
+        ".popsection");
+#elif defined(__sparc__)
+__asm__(".pushsection .init,\"ax\",@progbits\n\t"
+        "call __do_init\n\t"
+        ".popsection");
+#else
+#error "crtbegin without .init_fini array unimplemented for this architecture"
+#endif // CRT_HAS_INITFINI_ARRAY
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+static fp __DTOR_LIST__[]
+    __attribute__((section(".dtors"), aligned(sizeof(fp)))) = {(fp)-1};
+extern fp __DTOR_LIST_END__[];
+#endif
+
+static void __attribute__((used)) __do_fini(void) {
+  static _Bool __finalized;
+  if (__builtin_expect(__finalized, 0))
+    return;
+  __finalized = 1;
+
+  if (__cxa_finalize)
+    __cxa_finalize(__dso_handle);
+
+#ifndef CRT_HAS_INITFINI_ARRAY
+  const size_t n = __DTOR_LIST_END__ - __DTOR_LIST__ - 1;
+  for (size_t i = 1; i <= n; i++) __DTOR_LIST__[i]();
+#endif
+#ifdef EH_USE_FRAME_REGISTRY
+  if (__deregister_frame_info)
+    __deregister_frame_info(__EH_FRAME_LIST__);
+#endif
+}
+
+#ifdef CRT_HAS_INITFINI_ARRAY
+#if __has_feature(ptrauth_init_fini)
+// TODO: use __ptrauth-qualified pointers when they are supported on clang side
+#if __has_feature(ptrauth_init_fini_address_discrimination)
+__attribute__((section(".fini_array"), used)) static void *__fini =
+    ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
+                          ptrauth_blend_discriminator(
+                              &__fini, __ptrauth_init_fini_discriminator));
+#else
+__attribute__((section(".fini_array"), used)) static void *__fini =
+    ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
+                          __ptrauth_init_fini_discriminator);
+#endif
+#else
+__attribute__((section(".fini_array"),
+               used)) static void (*__fini)(void) = __do_fini;
+#endif
+#elif defined(__i386__) || defined(__x86_64__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#elif defined(__arm__) || defined(__aarch64__)
+__asm__(".pushsection .fini,\"ax\",%progbits\n\t"
+        "bl __do_fini\n\t"
+        ".popsection");
+#elif defined(__mips__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "jal __do_fini\n\t"
+        ".popsection");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "bl __do_fini\n\t"
+        "nop\n\t"
+        ".popsection");
+#elif defined(__riscv)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#elif defined(__sparc__)
+__asm__(".pushsection .fini,\"ax\",@progbits\n\t"
+        "call __do_fini\n\t"
+        ".popsection");
+#else
+#error "crtbegin without .init_fini array unimplemented for this architecture"
+#endif // CRT_HAS_INITFINI_ARRAY
diff --git a/compiler-rt/lib/builtins/crtend.c b/compiler-rt/lib/builtins/crtend.c
new file mode 100644
index 0000000..ebcc60b
--- /dev/null
+++ b/compiler-rt/lib/builtins/crtend.c
@@ -0,0 +1,22 @@
+//===-- crtend.c - End of constructors and destructors --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+// Put 4-byte zero which is the length field in FDE at the end as a terminator.
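+// (An FDE length field of zero marks the end of the .eh_frame entry list,
+// terminating what crtbegin's __EH_FRAME_LIST__ begins; the "used" attribute
+// keeps the array from being optimized away.)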
+const int32_t __EH_FRAME_LIST_END__[] + __attribute__((section(".eh_frame"), aligned(sizeof(int32_t)), + visibility("hidden"), used)) = {0}; + +#ifndef CRT_HAS_INITFINI_ARRAY +typedef void (*fp)(void); +fp __CTOR_LIST_END__[] + __attribute__((section(".ctors"), visibility("hidden"), used)) = {0}; +fp __DTOR_LIST_END__[] + __attribute__((section(".dtors"), visibility("hidden"), used)) = {0}; +#endif diff --git a/compiler-rt/lib/builtins/divmoddi4.c b/compiler-rt/lib/builtins/divmoddi4.c index e7cbbb1..64bbb69 100644 --- a/compiler-rt/lib/builtins/divmoddi4.c +++ b/compiler-rt/lib/builtins/divmoddi4.c @@ -18,8 +18,8 @@ COMPILER_RT_ABI di_int __divmoddi4(di_int a, di_int b, di_int *rem) { const int bits_in_dword_m1 = (int)(sizeof(di_int) * CHAR_BIT) - 1; di_int s_a = a >> bits_in_dword_m1; // s_a = a < 0 ? -1 : 0 di_int s_b = b >> bits_in_dword_m1; // s_b = b < 0 ? -1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (du_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (du_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient du_int r; di_int q = (__udivmoddi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divmodsi4.c b/compiler-rt/lib/builtins/divmodsi4.c index a85e299..193f810 100644 --- a/compiler-rt/lib/builtins/divmodsi4.c +++ b/compiler-rt/lib/builtins/divmodsi4.c @@ -19,8 +19,8 @@ COMPILER_RT_ABI si_int __divmodsi4(si_int a, si_int b, si_int *rem) { const int bits_in_word_m1 = (int)(sizeof(si_int) * CHAR_BIT) - 1; si_int s_a = a >> bits_in_word_m1; // s_a = a < 0 ? -1 : 0 si_int s_b = b >> bits_in_word_m1; // s_b = b < 0 ? -1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (su_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (su_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient su_int r; si_int q = (__udivmodsi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divmodti4.c b/compiler-rt/lib/builtins/divmodti4.c index b243ba4..185d3d4 100644 --- a/compiler-rt/lib/builtins/divmodti4.c +++ b/compiler-rt/lib/builtins/divmodti4.c @@ -20,8 +20,8 @@ COMPILER_RT_ABI ti_int __divmodti4(ti_int a, ti_int b, ti_int *rem) { const int bits_in_tword_m1 = (int)(sizeof(ti_int) * CHAR_BIT) - 1; ti_int s_a = a >> bits_in_tword_m1; // s_a = a < 0 ? -1 : 0 ti_int s_b = b >> bits_in_tword_m1; // s_b = b < 0 ? 
-1 : 0 - a = (a ^ s_a) - s_a; // negate if s_a == -1 - b = (b ^ s_b) - s_b; // negate if s_b == -1 + a = (tu_int)(a ^ s_a) - s_a; // negate if s_a == -1 + b = (tu_int)(b ^ s_b) - s_b; // negate if s_b == -1 s_b ^= s_a; // sign of quotient tu_int r; ti_int q = (__udivmodti4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1 diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c index 0e47992..c393de8 100644 --- a/compiler-rt/lib/builtins/divtc3.c +++ b/compiler-rt/lib/builtins/divtc3.c @@ -12,44 +12,45 @@ #define QUAD_PRECISION #include "fp_lib.h" -#include "int_lib.h" -#include "int_math.h" + +#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) // Returns: the quotient of (a + ib) / (c + id) -COMPILER_RT_ABI Lcomplex __divtc3(long double __a, long double __b, - long double __c, long double __d) { +COMPILER_RT_ABI Qcomplex __divtc3(fp_t __a, fp_t __b, fp_t __c, fp_t __d) { int __ilogbw = 0; - long double __logbw = - __compiler_rt_logbl(__compiler_rt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); + fp_t __logbw = __compiler_rt_logbtf( + __compiler_rt_fmaxtf(crt_fabstf(__c), crt_fabstf(__d))); if (crt_isfinite(__logbw)) { __ilogbw = (int)__logbw; - __c = __compiler_rt_scalbnl(__c, -__ilogbw); - __d = __compiler_rt_scalbnl(__d, -__ilogbw); + __c = __compiler_rt_scalbntf(__c, -__ilogbw); + __d = __compiler_rt_scalbntf(__d, -__ilogbw); } - long double __denom = __c * __c + __d * __d; - Lcomplex z; - COMPLEX_REAL(z) = - __compiler_rt_scalbnl((__a * __c + __b * __d) / __denom, -__ilogbw); - COMPLEX_IMAGINARY(z) = - __compiler_rt_scalbnl((__b * __c - __a * __d) / __denom, -__ilogbw); - if (crt_isnan(COMPLEX_REAL(z)) && crt_isnan(COMPLEX_IMAGINARY(z))) { + fp_t __denom = __c * __c + __d * __d; + Qcomplex z; + COMPLEXTF_REAL(z) = + __compiler_rt_scalbntf((__a * __c + __b * __d) / __denom, -__ilogbw); + COMPLEXTF_IMAGINARY(z) = + __compiler_rt_scalbntf((__b * __c - __a * __d) / __denom, -__ilogbw); + if (crt_isnan(COMPLEXTF_REAL(z)) && crt_isnan(COMPLEXTF_IMAGINARY(z))) { if ((__denom == 0.0) && (!crt_isnan(__a) || !crt_isnan(__b))) { - COMPLEX_REAL(z) = crt_copysignl(CRT_INFINITY, __c) * __a; - COMPLEX_IMAGINARY(z) = crt_copysignl(CRT_INFINITY, __c) * __b; + COMPLEXTF_REAL(z) = crt_copysigntf(CRT_INFINITY, __c) * __a; + COMPLEXTF_IMAGINARY(z) = crt_copysigntf(CRT_INFINITY, __c) * __b; } else if ((crt_isinf(__a) || crt_isinf(__b)) && crt_isfinite(__c) && crt_isfinite(__d)) { - __a = crt_copysignl(crt_isinf(__a) ? 1.0 : 0.0, __a); - __b = crt_copysignl(crt_isinf(__b) ? 1.0 : 0.0, __b); - COMPLEX_REAL(z) = CRT_INFINITY * (__a * __c + __b * __d); - COMPLEX_IMAGINARY(z) = CRT_INFINITY * (__b * __c - __a * __d); + __a = crt_copysigntf(crt_isinf(__a) ? (fp_t)1.0 : (fp_t)0.0, __a); + __b = crt_copysigntf(crt_isinf(__b) ? (fp_t)1.0 : (fp_t)0.0, __b); + COMPLEXTF_REAL(z) = CRT_INFINITY * (__a * __c + __b * __d); + COMPLEXTF_IMAGINARY(z) = CRT_INFINITY * (__b * __c - __a * __d); } else if (crt_isinf(__logbw) && __logbw > 0.0 && crt_isfinite(__a) && crt_isfinite(__b)) { - __c = crt_copysignl(crt_isinf(__c) ? 1.0 : 0.0, __c); - __d = crt_copysignl(crt_isinf(__d) ? 1.0 : 0.0, __d); - COMPLEX_REAL(z) = 0.0 * (__a * __c + __b * __d); - COMPLEX_IMAGINARY(z) = 0.0 * (__b * __c - __a * __d); + __c = crt_copysigntf(crt_isinf(__c) ? (fp_t)1.0 : (fp_t)0.0, __c); + __d = crt_copysigntf(crt_isinf(__d) ? 
(fp_t)1.0 : (fp_t)0.0, __d); + COMPLEXTF_REAL(z) = 0.0 * (__a * __c + __b * __d); + COMPLEXTF_IMAGINARY(z) = 0.0 * (__b * __c - __a * __d); } } return z; } + +#endif diff --git a/compiler-rt/lib/builtins/divtf3.c b/compiler-rt/lib/builtins/divtf3.c index 5bcc9a8..bd76763 100644 --- a/compiler-rt/lib/builtins/divtf3.c +++ b/compiler-rt/lib/builtins/divtf3.c @@ -14,7 +14,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define NUMBER_OF_HALF_ITERATIONS 4 #define NUMBER_OF_FULL_ITERATIONS 1 diff --git a/compiler-rt/lib/builtins/divxc3.c b/compiler-rt/lib/builtins/divxc3.c index 97ffd2e..3423334 100644 --- a/compiler-rt/lib/builtins/divxc3.c +++ b/compiler-rt/lib/builtins/divxc3.c @@ -17,16 +17,16 @@ // Returns: the quotient of (a + ib) / (c + id) -COMPILER_RT_ABI Lcomplex __divxc3(long double __a, long double __b, - long double __c, long double __d) { +COMPILER_RT_ABI Lcomplex __divxc3(xf_float __a, xf_float __b, xf_float __c, + xf_float __d) { int __ilogbw = 0; - long double __logbw = crt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); + xf_float __logbw = crt_logbl(crt_fmaxl(crt_fabsl(__c), crt_fabsl(__d))); if (crt_isfinite(__logbw)) { __ilogbw = (int)__logbw; __c = crt_scalbnl(__c, -__ilogbw); __d = crt_scalbnl(__d, -__ilogbw); } - long double __denom = __c * __c + __d * __d; + xf_float __denom = __c * __c + __d * __d; Lcomplex z; COMPLEX_REAL(z) = crt_scalbnl((__a * __c + __b * __d) / __denom, -__ilogbw); COMPLEX_IMAGINARY(z) = diff --git a/compiler-rt/lib/builtins/extendbfsf2.c b/compiler-rt/lib/builtins/extendbfsf2.c new file mode 100644 index 0000000..e159d79 --- /dev/null +++ b/compiler-rt/lib/builtins/extendbfsf2.c @@ -0,0 +1,13 @@ +//===-- lib/extendbfsf2.c - bfloat -> single conversion -----------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
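// Review note (illustrative sketch, not part of the patch): the __divmoddi4 /
// __divmodsi4 / __divmodti4 hunks above all make the same fix. The idiom
// (a ^ s) - s negates a when the sign mask s is -1, but for the most negative
// value the signed intermediate overflows, which is undefined behavior in C;
// routing the subtraction through the unsigned type makes it well defined.
// A minimal sketch of the 64-bit case:

#include <stdint.h>
static uint64_t branchless_abs64(int64_t a) {
  const int64_t s = a >> 63; // -1 if a < 0, else 0 (arithmetic shift)
  // Unsigned wraparound is defined, so this is valid even for INT64_MIN,
  // whose magnitude (2^63) only the unsigned type can represent -- exactly
  // the case the (du_int)/(su_int)/(tu_int) casts above handle.
  return (uint64_t)(a ^ s) - (uint64_t)s;
}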
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define SRC_BFLOAT16 +#define DST_SINGLE +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI float __extendbfsf2(src_t a) { return __extendXfYf2__(a); } diff --git a/compiler-rt/lib/builtins/extenddftf2.c b/compiler-rt/lib/builtins/extenddftf2.c index ddf470e..a61ef53 100644 --- a/compiler-rt/lib/builtins/extenddftf2.c +++ b/compiler-rt/lib/builtins/extenddftf2.c @@ -9,13 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_DOUBLE #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI fp_t __extenddftf2(double a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extenddftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendhftf2.c b/compiler-rt/lib/builtins/extendhftf2.c index aefe973..7609db6 100644 --- a/compiler-rt/lib/builtins/extendhftf2.c +++ b/compiler-rt/lib/builtins/extendhftf2.c @@ -10,14 +10,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) && \ - defined(COMPILER_RT_HAS_FLOAT16) +#if defined(CRT_HAS_TF_MODE) && defined(COMPILER_RT_HAS_FLOAT16) #define SRC_HALF #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI long double __extendhftf2(_Float16 a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extendhftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendsftf2.c b/compiler-rt/lib/builtins/extendsftf2.c index cf1fd2f..4ab2982 100644 --- a/compiler-rt/lib/builtins/extendsftf2.c +++ b/compiler-rt/lib/builtins/extendsftf2.c @@ -9,13 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_SINGLE #define DST_QUAD #include "fp_extend_impl.inc" -COMPILER_RT_ABI fp_t __extendsftf2(float a) { - return __extendXfYf2__(a); -} +COMPILER_RT_ABI dst_t __extendsftf2(src_t a) { return __extendXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/extendxftf2.c b/compiler-rt/lib/builtins/extendxftf2.c new file mode 100644 index 0000000..c1d97b5 --- /dev/null +++ b/compiler-rt/lib/builtins/extendxftf2.c @@ -0,0 +1,24 @@ +//===-- lib/extendxftf2.c - long double -> quad conversion --------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Assumption: long double is a IEEE 80 bit floating point type padded to 128 +// bits. 
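// Review note (illustrative sketch, not part of the patch): unlike the
// IEEE-754 interchange formats, the x87 80-bit format stores its integer bit
// explicitly, so the 80 significant bits are 1 sign + 15 exponent + 64
// significand (1 integer bit + 63 fraction bits), with no implicit leading 1.
// That is why fp_extend.h defines srcSigFracBits = 63 and srcExpBits = 15 for
// SRC_80. A hypothetical decomposition of the low 80 bits of such a value:

#include <stdint.h>
typedef unsigned __int128 u128_sketch; // helper names are illustrative only
static void split_x87_sketch(u128_sketch rep, int *sign, int *exp,
                             uint64_t *sig) {
  *sign = (int)((rep >> 79) & 1);     // bit 79: sign
  *exp = (int)((rep >> 64) & 0x7fff); // bits 64..78: 15-bit biased exponent
  *sig = (uint64_t)rep;               // bits 0..63: significand, explicit
                                      // integer bit at bit 63
}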
+ +#define QUAD_PRECISION +#include "fp_lib.h" + +#if defined(CRT_HAS_TF_MODE) && __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) +#define SRC_80 +#define DST_QUAD +#include "fp_extend_impl.inc" + +COMPILER_RT_ABI tf_float __extendxftf2(xf_float a) { + return __extendXfYf2__(a); +} + +#endif diff --git a/compiler-rt/lib/builtins/fixtfdi.c b/compiler-rt/lib/builtins/fixtfdi.c index fe570e6..d27a99b 100644 --- a/compiler-rt/lib/builtins/fixtfdi.c +++ b/compiler-rt/lib/builtins/fixtfdi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef di_int fixint_t; typedef du_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixtfsi.c b/compiler-rt/lib/builtins/fixtfsi.c index a32bd96..01e352a 100644 --- a/compiler-rt/lib/builtins/fixtfsi.c +++ b/compiler-rt/lib/builtins/fixtfsi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef si_int fixint_t; typedef su_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixtfti.c b/compiler-rt/lib/builtins/fixtfti.c index 19f84ce..491fca5 100644 --- a/compiler-rt/lib/builtins/fixtfti.c +++ b/compiler-rt/lib/builtins/fixtfti.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef ti_int fixint_t; typedef tu_int fixuint_t; #include "fp_fixint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfdi.c b/compiler-rt/lib/builtins/fixunstfdi.c index a0805e6..febdb8f 100644 --- a/compiler-rt/lib/builtins/fixunstfdi.c +++ b/compiler-rt/lib/builtins/fixunstfdi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef du_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfsi.c b/compiler-rt/lib/builtins/fixunstfsi.c index 3a1320e..4efc387 100644 --- a/compiler-rt/lib/builtins/fixunstfsi.c +++ b/compiler-rt/lib/builtins/fixunstfsi.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef su_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunstfti.c b/compiler-rt/lib/builtins/fixunstfti.c index 23cd1ab..fa9e7aa 100644 --- a/compiler-rt/lib/builtins/fixunstfti.c +++ b/compiler-rt/lib/builtins/fixunstfti.c @@ -9,7 +9,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) typedef tu_int fixuint_t; #include "fp_fixuint_impl.inc" diff --git a/compiler-rt/lib/builtins/fixunsxfdi.c b/compiler-rt/lib/builtins/fixunsxfdi.c index c8a8061..957c263 100644 --- a/compiler-rt/lib/builtins/fixunsxfdi.c +++ b/compiler-rt/lib/builtins/fixunsxfdi.c @@ -32,8 +32,8 @@ #pragma warning(disable : 4700) #endif -COMPILER_RT_ABI du_int __fixunsxfdi(long double a) { - long_double_bits fb; +COMPILER_RT_ABI du_int __fixunsxfdi(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixunsxfsi.c b/compiler-rt/lib/builtins/fixunsxfsi.c index 154abcb..a0abb82 100644 --- a/compiler-rt/lib/builtins/fixunsxfsi.c +++ b/compiler-rt/lib/builtins/fixunsxfsi.c @@ -32,8 +32,8 @@ #pragma 
warning(disable : 4700) #endif -COMPILER_RT_ABI su_int __fixunsxfsi(long double a) { - long_double_bits fb; +COMPILER_RT_ABI su_int __fixunsxfsi(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixunsxfti.c b/compiler-rt/lib/builtins/fixunsxfti.c index 508554e..be3f75f 100644 --- a/compiler-rt/lib/builtins/fixunsxfti.c +++ b/compiler-rt/lib/builtins/fixunsxfti.c @@ -25,8 +25,8 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI tu_int __fixunsxfti(long double a) { - long_double_bits fb; +COMPILER_RT_ABI tu_int __fixunsxfti(xf_float a) { + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0 || (fb.u.high.s.low & 0x00008000)) diff --git a/compiler-rt/lib/builtins/fixxfdi.c b/compiler-rt/lib/builtins/fixxfdi.c index 86cf376..35d7083 100644 --- a/compiler-rt/lib/builtins/fixxfdi.c +++ b/compiler-rt/lib/builtins/fixxfdi.c @@ -31,10 +31,10 @@ #pragma warning(disable : 4700) #endif -COMPILER_RT_ABI di_int __fixxfdi(long double a) { +COMPILER_RT_ABI di_int __fixxfdi(xf_float a) { const di_int di_max = (di_int)((~(du_int)0) / 2); const di_int di_min = -di_max - 1; - long_double_bits fb; + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0) diff --git a/compiler-rt/lib/builtins/fixxfti.c b/compiler-rt/lib/builtins/fixxfti.c index 90e0311..95038df 100644 --- a/compiler-rt/lib/builtins/fixxfti.c +++ b/compiler-rt/lib/builtins/fixxfti.c @@ -24,10 +24,10 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI ti_int __fixxfti(long double a) { +COMPILER_RT_ABI ti_int __fixxfti(xf_float a) { const ti_int ti_max = (ti_int)((~(tu_int)0) / 2); const ti_int ti_min = -ti_max - 1; - long_double_bits fb; + xf_bits fb; fb.f = a; int e = (fb.u.high.s.low & 0x00007FFF) - 16383; if (e < 0) diff --git a/compiler-rt/lib/builtins/floatdidf.c b/compiler-rt/lib/builtins/floatdidf.c index d37c43b..6da81f7 100644 --- a/compiler-rt/lib/builtins/floatdidf.c +++ b/compiler-rt/lib/builtins/floatdidf.c @@ -45,53 +45,11 @@ COMPILER_RT_ABI double __floatdidf(di_int a) { // flags to set, and we don't want to code-gen to an unknown soft-float // implementation. 
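// Review note (illustrative sketch, not part of the patch): every
// hand-written int-to-float body removed below rounds the same way, and the
// new shared int_to_fp_impl.inc template keeps that behavior. The integer is
// reduced to the target's mantissa width plus two extra bits: Q (the round
// bit) and R (the sticky OR of everything shifted out). In the removed code,
// "a |= (a & 4) != 0" folds the lowest kept mantissa bit P into R, so the
// following "++a" carries into the mantissa only when Q is set and (R or P)
// is set -- exactly round-to-nearest, ties-to-even -- and "a >>= 2" then
// discards Q and R. The sticky reduction by itself looks like this:

#include <stdint.h>
static uint64_t shift_right_sticky(uint64_t x, int count) { // 0 < count < 64
  const uint64_t lost = x & ((UINT64_C(1) << count) - 1);
  return (x >> count) | (lost != 0); // bit 0 remembers any lost ones
}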
-COMPILER_RT_ABI double __floatdidf(di_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(di_int) * CHAR_BIT; - const di_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = ((du_int)a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((du_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)s & 0x80000000) | // sign - ((su_int)(e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +#define SRC_I64 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI double __floatdidf(di_int a) { return __floatXiYf__(a); } #endif #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/floatdisf.c b/compiler-rt/lib/builtins/floatdisf.c index 5c63164..0bb88c5 100644 --- a/compiler-rt/lib/builtins/floatdisf.c +++ b/compiler-rt/lib/builtins/floatdisf.c @@ -19,52 +19,11 @@ #include "int_lib.h" -COMPILER_RT_ABI float __floatdisf(di_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(di_int) * CHAR_BIT; - const di_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __builtin_clzll(a); // number of significant digits - si_int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = ((du_int)a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((du_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((su_int)s & 0x80000000) | // sign - ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +#define SRC_I64 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI float __floatdisf(di_int a) { return __floatXiYf__(a); } #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) diff --git 
a/compiler-rt/lib/builtins/floatditf.c b/compiler-rt/lib/builtins/floatditf.c index 9b07b65..c6e326a 100644 --- a/compiler-rt/lib/builtins/floatditf.c +++ b/compiler-rt/lib/builtins/floatditf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatditf(di_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatdixf.c b/compiler-rt/lib/builtins/floatdixf.c index ad5deb2..3d9e664 100644 --- a/compiler-rt/lib/builtins/floatdixf.c +++ b/compiler-rt/lib/builtins/floatdixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatdixf(di_int a) { +COMPILER_RT_ABI xf_float __floatdixf(di_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(di_int) * CHAR_BIT; @@ -31,7 +31,7 @@ COMPILER_RT_ABI long double __floatdixf(di_int a) { a = (a ^ s) - s; int clz = __builtin_clzll(a); int e = (N - 1) - clz; // exponent - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = ((su_int)s & 0x00008000) | // sign (e + 16383); // exponent fb.u.low.all = a << clz; // mantissa diff --git a/compiler-rt/lib/builtins/floatsitf.c b/compiler-rt/lib/builtins/floatsitf.c index 92f207a..314a8a7 100644 --- a/compiler-rt/lib/builtins/floatsitf.c +++ b/compiler-rt/lib/builtins/floatsitf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatsitf(si_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floattidf.c b/compiler-rt/lib/builtins/floattidf.c index 0a1c04b..ef8fe18 100644 --- a/compiler-rt/lib/builtins/floattidf.c +++ b/compiler-rt/lib/builtins/floattidf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_I128 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a double, rounding toward even. 
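// Review note (not part of the patch): the pattern used throughout these
// float*.c files is compile-time templating by macro selection. Each file
// picks a (source, destination) pair and includes the shared implementation,
// which expands to a __floatXiYf__ specialized for that pair:
//
//   #define SRC_I128                  // source: signed 128-bit integer
//   #define DST_DOUBLE                // destination: IEEE-754 binary64
//   #include "int_to_fp_impl.inc"     // instantiates __floatXiYf__
//   COMPILER_RT_ABI double __floattidf(ti_int a) { return __floatXiYf__(a); }
//
// One audited rounding routine now backs all of the di/ti/undi/unti variants
// that previously carried near-identical hand-maintained bodies.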
// Assumption: double is a IEEE 64 bit floating point type @@ -22,52 +26,6 @@ // seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm // mmmm -COMPILER_RT_ABI double __floattidf(ti_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)s & 0x80000000) | // sign - ((e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +COMPILER_RT_ABI double __floattidf(ti_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floattisf.c b/compiler-rt/lib/builtins/floattisf.c index a8fcdbe..7758990 100644 --- a/compiler-rt/lib/builtins/floattisf.c +++ b/compiler-rt/lib/builtins/floattisf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_I128 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a float, rounding toward even. 
// Assumption: float is a IEEE 32 bit floating point type @@ -21,51 +25,6 @@ // seee eeee emmm mmmm mmmm mmmm mmmm mmmm -COMPILER_RT_ABI float __floattisf(ti_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((tu_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((su_int)s & 0x80000000) | // sign - ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +COMPILER_RT_ABI float __floattisf(ti_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floattitf.c b/compiler-rt/lib/builtins/floattitf.c index 196cbda..5dffe22 100644 --- a/compiler-rt/lib/builtins/floattitf.c +++ b/compiler-rt/lib/builtins/floattitf.c @@ -16,6 +16,11 @@ #include "fp_lib.h" #include "int_lib.h" +#if defined(CRT_HAS_TF_MODE) +#define SRC_I128 +#define DST_QUAD +#include "int_to_fp_impl.inc" + // Returns: convert a ti_int to a fp_t, rounding toward even. 
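// Review note (not part of the patch): the recurring guard change in these
// quad-precision files -- CRT_HAS_128BIT && CRT_LDBL_128BIT becoming
// CRT_HAS_TF_MODE -- is what lets the tf_* builtins exist on targets where
// long double is not binary128. CRT_LDBL_128BIT held only when long double
// itself was IEEE quad; CRT_HAS_TF_MODE (from int_types.h) instead asks
// whether any suitable 128-bit float type exists, binding tf_float to either
// long double or _Float128 as appropriate. Roughly (sketch only; see
// int_types.h for the exact condition):
//
//   #if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
//   #define CRT_HAS_TF_MODE
//   #endif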
// Assumption: fp_t is a IEEE 128 bit floating point type @@ -25,54 +30,6 @@ // mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) -COMPILER_RT_ABI fp_t __floattitf(ti_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(ti_int) * CHAR_BIT; - const ti_int s = a >> (N - 1); - a = (a ^ s) - s; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > LDBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit LDBL_MANT_DIG-1 bits to the right of 1 - // Q = bit LDBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case LDBL_MANT_DIG + 1: - a <<= 1; - break; - case LDBL_MANT_DIG + 2: - break; - default: - a = ((tu_int)a >> (sd - (LDBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << LDBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to LDBL_MANT_DIG bits - } else { - a <<= (LDBL_MANT_DIG - sd); - // a is now rounded to LDBL_MANT_DIG bits - } - - long_double_bits fb; - fb.u.high.all = (s & 0x8000000000000000LL) // sign - | (du_int)(e + 16383) << 48 // exponent - | ((a >> 64) & 0x0000ffffffffffffLL); // significand - fb.u.low.all = (du_int)(a); - return fb.f; -} +COMPILER_RT_ABI fp_t __floattitf(ti_int a) { return __floatXiYf__(a); } #endif diff --git a/compiler-rt/lib/builtins/floattixf.c b/compiler-rt/lib/builtins/floattixf.c index 23796f1..c80bc71 100644 --- a/compiler-rt/lib/builtins/floattixf.c +++ b/compiler-rt/lib/builtins/floattixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floattixf(ti_int a) { +COMPILER_RT_ABI xf_float __floattixf(ti_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(ti_int) * CHAR_BIT; @@ -63,7 +63,7 @@ COMPILER_RT_ABI long double __floattixf(ti_int a) { a <<= (LDBL_MANT_DIG - sd); // a is now rounded to LDBL_MANT_DIG bits } - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = ((su_int)s & 0x8000) | // sign (e + 16383); // exponent fb.u.low.all = (du_int)a; // mantissa diff --git a/compiler-rt/lib/builtins/floatundidf.c b/compiler-rt/lib/builtins/floatundidf.c index 2ec802c..9743e96 100644 --- a/compiler-rt/lib/builtins/floatundidf.c +++ b/compiler-rt/lib/builtins/floatundidf.c @@ -51,50 +51,11 @@ COMPILER_RT_ABI double __floatundidf(du_int a) { // flags to set, and we don't want to code-gen to an unknown soft-float // implementation. 
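// Review note (illustrative sketch, not part of the patch): a concrete probe
// of the ties-to-even rounding that both the removed body below and the
// shared template must preserve. 2^53 is the first point where double drops
// integer precision, so a trailing 1 bit is decided by the tie rule (on
// soft-float targets such as this library's rv64 build, these casts lower to
// __floatundidf):

#include <assert.h>
#include <stdint.h>
static void check_ties_to_even(void) {
  const uint64_t p53 = UINT64_C(1) << 53;
  assert((double)(p53 + 1) == (double)p53);       // tie: even neighbor 2^53 wins
  assert((double)(p53 + 3) == (double)(p53 + 4)); // tie: rounds up to even 2^53+4
}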
-COMPILER_RT_ABI double __floatundidf(du_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(du_int) * CHAR_BIT; - int sd = N - __builtin_clzll(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((du_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((su_int)(e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +#define SRC_U64 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI double __floatundidf(du_int a) { return __floatXiYf__(a); } #endif #if defined(__ARM_EABI__) diff --git a/compiler-rt/lib/builtins/floatundisf.c b/compiler-rt/lib/builtins/floatundisf.c index 2a4157d..d4b418e 100644 --- a/compiler-rt/lib/builtins/floatundisf.c +++ b/compiler-rt/lib/builtins/floatundisf.c @@ -19,49 +19,11 @@ #include "int_lib.h" -COMPILER_RT_ABI float __floatundisf(du_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(du_int) * CHAR_BIT; - int sd = N - __builtin_clzll(a); // number of significant digits - si_int e = sd - 1; // 8 exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((du_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((du_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +#define SRC_U64 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + +COMPILER_RT_ABI float __floatundisf(du_int a) { return __floatXiYf__(a); } #if defined(__ARM_EABI__) #if defined(COMPILER_RT_ARMHF_TARGET) diff --git a/compiler-rt/lib/builtins/floatunditf.c b/compiler-rt/lib/builtins/floatunditf.c index 8d31085..abe0ca9 100644 --- a/compiler-rt/lib/builtins/floatunditf.c +++ 
b/compiler-rt/lib/builtins/floatunditf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatunditf(du_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatundixf.c b/compiler-rt/lib/builtins/floatundixf.c index 85264ad..3e3c655 100644 --- a/compiler-rt/lib/builtins/floatundixf.c +++ b/compiler-rt/lib/builtins/floatundixf.c @@ -22,13 +22,13 @@ // gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatundixf(du_int a) { +COMPILER_RT_ABI xf_float __floatundixf(du_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(du_int) * CHAR_BIT; int clz = __builtin_clzll(a); int e = (N - 1) - clz; // exponent - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = (e + 16383); // exponent fb.u.low.all = a << clz; // mantissa return fb.f; diff --git a/compiler-rt/lib/builtins/floatunsitf.c b/compiler-rt/lib/builtins/floatunsitf.c index 7ba1fb6..3f0a524 100644 --- a/compiler-rt/lib/builtins/floatunsitf.c +++ b/compiler-rt/lib/builtins/floatunsitf.c @@ -15,7 +15,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __floatunsitf(su_int a) { const int aWidth = sizeof a * CHAR_BIT; diff --git a/compiler-rt/lib/builtins/floatuntidf.c b/compiler-rt/lib/builtins/floatuntidf.c index e69e65c..9abeacc 100644 --- a/compiler-rt/lib/builtins/floatuntidf.c +++ b/compiler-rt/lib/builtins/floatuntidf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_U128 +#define DST_DOUBLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a double, rounding toward even. 
// Assumption: double is a IEEE 64 bit floating point type @@ -22,49 +26,6 @@ // seee eeee eeee mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm // mmmm -COMPILER_RT_ABI double __floatuntidf(tu_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > DBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit DBL_MANT_DIG-1 bits to the right of 1 - // Q = bit DBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case DBL_MANT_DIG + 1: - a <<= 1; - break; - case DBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (DBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + DBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to DBL_MANT_DIG or DBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << DBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to DBL_MANT_DIG bits - } else { - a <<= (DBL_MANT_DIG - sd); - // a is now rounded to DBL_MANT_DIG bits - } - double_bits fb; - fb.u.s.high = ((e + 1023) << 20) | // exponent - ((su_int)(a >> 32) & 0x000FFFFF); // mantissa-high - fb.u.s.low = (su_int)a; // mantissa-low - return fb.f; -} +COMPILER_RT_ABI double __floatuntidf(tu_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floatuntisf.c b/compiler-rt/lib/builtins/floatuntisf.c index 9dec0ab..997c156 100644 --- a/compiler-rt/lib/builtins/floatuntisf.c +++ b/compiler-rt/lib/builtins/floatuntisf.c @@ -14,6 +14,10 @@ #ifdef CRT_HAS_128BIT +#define SRC_U128 +#define DST_SINGLE +#include "int_to_fp_impl.inc" + // Returns: convert a to a float, rounding toward even. 
// Assumption: float is a IEEE 32 bit floating point type @@ -21,48 +25,6 @@ // seee eeee emmm mmmm mmmm mmmm mmmm mmmm -COMPILER_RT_ABI float __floatuntisf(tu_int a) { - if (a == 0) - return 0.0F; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > FLT_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit FLT_MANT_DIG-1 bits to the right of 1 - // Q = bit FLT_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case FLT_MANT_DIG + 1: - a <<= 1; - break; - case FLT_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (FLT_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + FLT_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to FLT_MANT_DIG or FLT_MANT_DIG+1 bits - if (a & ((tu_int)1 << FLT_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to FLT_MANT_DIG bits - } else { - a <<= (FLT_MANT_DIG - sd); - // a is now rounded to FLT_MANT_DIG bits - } - float_bits fb; - fb.u = ((e + 127) << 23) | // exponent - ((su_int)a & 0x007FFFFF); // mantissa - return fb.f; -} +COMPILER_RT_ABI float __floatuntisf(tu_int a) { return __floatXiYf__(a); } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/floatuntitf.c b/compiler-rt/lib/builtins/floatuntitf.c index d308d31..1c5998a 100644 --- a/compiler-rt/lib/builtins/floatuntitf.c +++ b/compiler-rt/lib/builtins/floatuntitf.c @@ -16,6 +16,11 @@ #include "fp_lib.h" #include "int_lib.h" +#if defined(CRT_HAS_TF_MODE) +#define SRC_U128 +#define DST_QUAD +#include "int_to_fp_impl.inc" + // Returns: convert a tu_int to a fp_t, rounding toward even. 
// Assumption: fp_t is a IEEE 128 bit floating point type @@ -25,51 +30,6 @@ // mmmm | mmmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) -COMPILER_RT_ABI fp_t __floatuntitf(tu_int a) { - if (a == 0) - return 0.0; - const unsigned N = sizeof(tu_int) * CHAR_BIT; - int sd = N - __clzti2(a); // number of significant digits - int e = sd - 1; // exponent - if (sd > LDBL_MANT_DIG) { - // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx - // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR - // 12345678901234567890123456 - // 1 = msb 1 bit - // P = bit LDBL_MANT_DIG-1 bits to the right of 1 - // Q = bit LDBL_MANT_DIG bits to the right of 1 - // R = "or" of all bits to the right of Q - switch (sd) { - case LDBL_MANT_DIG + 1: - a <<= 1; - break; - case LDBL_MANT_DIG + 2: - break; - default: - a = (a >> (sd - (LDBL_MANT_DIG + 2))) | - ((a & ((tu_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0); - }; - // finish: - a |= (a & 4) != 0; // Or P into R - ++a; // round - this step may add a significant bit - a >>= 2; // dump Q and R - // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits - if (a & ((tu_int)1 << LDBL_MANT_DIG)) { - a >>= 1; - ++e; - } - // a is now rounded to LDBL_MANT_DIG bits - } else { - a <<= (LDBL_MANT_DIG - sd); - // a is now rounded to LDBL_MANT_DIG bits - } - - long_double_bits fb; - fb.u.high.all = (du_int)(e + 16383) << 48 // exponent - | ((a >> 64) & 0x0000ffffffffffffLL); // significand - fb.u.low.all = (du_int)(a); - return fb.f; -} +COMPILER_RT_ABI fp_t __floatuntitf(tu_int a) { return __floatXiYf__(a); } #endif diff --git a/compiler-rt/lib/builtins/floatuntixf.c b/compiler-rt/lib/builtins/floatuntixf.c index efd8a27..4c53775 100644 --- a/compiler-rt/lib/builtins/floatuntixf.c +++ b/compiler-rt/lib/builtins/floatuntixf.c @@ -23,7 +23,7 @@ // eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm // mmmm mmmm mmmm -COMPILER_RT_ABI long double __floatuntixf(tu_int a) { +COMPILER_RT_ABI xf_float __floatuntixf(tu_int a) { if (a == 0) return 0.0; const unsigned N = sizeof(tu_int) * CHAR_BIT; @@ -61,7 +61,7 @@ COMPILER_RT_ABI long double __floatuntixf(tu_int a) { a <<= (LDBL_MANT_DIG - sd); // a is now rounded to LDBL_MANT_DIG bits } - long_double_bits fb; + xf_bits fb; fb.u.high.s.low = (e + 16383); // exponent fb.u.low.all = (du_int)a; // mantissa return fb.f; diff --git a/compiler-rt/lib/builtins/fp_add_impl.inc b/compiler-rt/lib/builtins/fp_add_impl.inc index 7133358..d205999 100644 --- a/compiler-rt/lib/builtins/fp_add_impl.inc +++ b/compiler-rt/lib/builtins/fp_add_impl.inc @@ -91,7 +91,7 @@ static __inline fp_t __addXf3__(fp_t a, fp_t b) { // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. 
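// Review note (not part of the patch): the cast below cannot change the
// value. By this point __addXf3__ has already swapped its operands so that
// aExponent >= bExponent, making the difference non-negative; the explicit
// (unsigned int) conversion just documents the narrowing and keeps
// implicit-conversion warnings quiet. Sketch:
//
//   int aExponent, bExponent;   // invariant: aExponent >= bExponent
//   unsigned int align = (unsigned int)(aExponent - bExponent); // exact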
- const unsigned int align = aExponent - bExponent; + const unsigned int align = (unsigned int)(aExponent - bExponent); if (align) { if (align < typeWidth) { const bool sticky = (bSignificand << (typeWidth - align)) != 0; diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h index eee4722..22bf2b2 100644 --- a/compiler-rt/lib/builtins/fp_extend.h +++ b/compiler-rt/lib/builtins/fp_extend.h @@ -20,24 +20,37 @@ typedef float src_t; typedef uint32_t src_rep_t; #define SRC_REP_C UINT32_C -static const int srcSigBits = 23; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 23; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #define src_rep_t_clz clzsi #elif defined SRC_DOUBLE typedef double src_t; typedef uint64_t src_rep_t; #define SRC_REP_C UINT64_C -static const int srcSigBits = 52; -static __inline int src_rep_t_clz(src_rep_t a) { -#if defined __LP64__ - return __builtin_clzl(a); -#else - if (a & REP_C(0xffffffff00000000)) - return clzsi(a >> 32); - else - return 32 + clzsi(a & REP_C(0xffffffff)); -#endif -} +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 52; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 11; + +static inline int src_rep_t_clz_impl(src_rep_t a) { return __builtin_clzll(a); } +#define src_rep_t_clz src_rep_t_clz_impl + +#elif defined SRC_80 +typedef xf_float src_t; +typedef __uint128_t src_rep_t; +#define SRC_REP_C (__uint128_t) +// sign bit, exponent and significand occupy the lower 80 bits. +static const int srcBits = 80; +static const int srcSigFracBits = 63; +// -1 accounts for the sign bit. +// -1 accounts for the explicitly stored integer bit. +// srcBits - srcSigFracBits - 1 - 1 +static const int srcExpBits = 15; #elif defined SRC_HALF #ifdef COMPILER_RT_HAS_FLOAT16 @@ -47,7 +60,31 @@ typedef uint16_t src_t; #endif typedef uint16_t src_rep_t; #define SRC_REP_C UINT16_C -static const int srcSigBits = 10; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 10; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 5; + +static inline int src_rep_t_clz_impl(src_rep_t a) { + return __builtin_clz(a) - 16; +} + +#define src_rep_t_clz src_rep_t_clz_impl + +#elif defined SRC_BFLOAT16 +#ifdef COMPILER_RT_HAS_BFLOAT16 +typedef __bf16 src_t; +#else +typedef uint16_t src_t; +#endif +typedef uint16_t src_rep_t; +#define SRC_REP_C UINT16_C +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 7; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #define src_rep_t_clz __builtin_clz #else @@ -58,28 +95,72 @@ static const int srcSigBits = 10; typedef float dst_t; typedef uint32_t dst_rep_t; #define DST_REP_C UINT32_C -static const int dstSigBits = 23; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 23; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #elif defined DST_DOUBLE typedef double dst_t; typedef uint64_t dst_rep_t; #define DST_REP_C UINT64_C -static const int dstSigBits = 52; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 52; +// -1 accounts for the sign bit. 
+// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 11; #elif defined DST_QUAD -typedef long double dst_t; +typedef tf_float dst_t; typedef __uint128_t dst_rep_t; #define DST_REP_C (__uint128_t) -static const int dstSigBits = 112; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 112; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 15; #else #error Destination should be single, double, or quad precision! #endif // end destination precision -// End of specialization parameters. Two helper routines for conversion to and -// from the representation of floating-point data as integer values follow. +// End of specialization parameters. + +// TODO: These helper routines should be placed into fp_lib.h +// Currently they depend on macros/constants defined above. + +static inline src_rep_t extract_sign_from_src(src_rep_t x) { + const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1); + return (x & srcSignMask) >> (srcBits - 1); +} + +static inline src_rep_t extract_exp_from_src(src_rep_t x) { + const int srcSigBits = srcBits - 1 - srcExpBits; + const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits; + return (x & srcExpMask) >> srcSigBits; +} + +static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) { + const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1; + return x & srcSigFracMask; +} + +#ifdef src_rep_t_clz +static inline int clz_in_sig_frac(src_rep_t sigFrac) { + const int skip = 1 + srcExpBits; + return src_rep_t_clz(sigFrac) - skip; +} +#endif + +static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) { + return (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac; +} + +// Two helper routines for conversion to and from the representation of +// floating-point data as integer values follow. -static __inline src_rep_t srcToRep(src_t x) { +static inline src_rep_t srcToRep(src_t x) { const union { src_t f; src_rep_t i; @@ -87,7 +168,7 @@ static __inline src_rep_t srcToRep(src_t x) { return rep.i; } -static __inline dst_t dstFromRep(dst_rep_t x) { +static inline dst_t dstFromRep(dst_rep_t x) { const union { dst_t f; dst_rep_t i; diff --git a/compiler-rt/lib/builtins/fp_extend_impl.inc b/compiler-rt/lib/builtins/fp_extend_impl.inc index d1c9c02..f4f6630 100644 --- a/compiler-rt/lib/builtins/fp_extend_impl.inc +++ b/compiler-rt/lib/builtins/fp_extend_impl.inc @@ -37,71 +37,72 @@ #include "fp_extend.h" +// The source type may use a usual IEEE-754 interchange format or Intel 80-bit +// format. In particular, for the source type srcSigFracBits may be not equal to +// srcSigBits. The destination type is assumed to be one of IEEE-754 standard +// types. static __inline dst_t __extendXfYf2__(src_t a) { // Various constants whose values follow from the type parameters. // Any reasonable optimizer will fold and propagate all of these. 
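// Review note (illustrative sketch, not part of the patch): the rewrite below
// stops juggling a packed "absResult" and instead carries sign, exponent and
// fraction as separate fields, reassembled at the end by construct_dst_rep().
// Field-wise handling is what makes the non-IEEE 80-bit source (explicit
// integer bit, discontiguous layout) expressible. For a normal number the
// extension is just a rebias plus a left-align; e.g. float -> double by hand:

#include <stdint.h>
#include <string.h>
static double extend_normal_f32_sketch(float x) { // normal, finite inputs only
  uint32_t r;
  memcpy(&r, &x, sizeof r);
  const uint64_t sign = r >> 31;
  const uint64_t exp = ((r >> 23) & 0xff) + (1023 - 127);      // rebias 8 -> 11 bits
  const uint64_t frac = (uint64_t)(r & 0x7fffff) << (52 - 23); // left-align fraction
  const uint64_t bits = (sign << 63) | (exp << 52) | frac;
  double d;
  memcpy(&d, &bits, sizeof d);
  return d;
}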
- const int srcBits = sizeof(src_t) * CHAR_BIT; - const int srcExpBits = srcBits - srcSigBits - 1; const int srcInfExp = (1 << srcExpBits) - 1; const int srcExpBias = srcInfExp >> 1; - const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits; - const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits; - const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits); - const src_rep_t srcAbsMask = srcSignMask - 1; - const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1); - const src_rep_t srcNaNCode = srcQNaN - 1; - - const int dstBits = sizeof(dst_t) * CHAR_BIT; - const int dstExpBits = dstBits - dstSigBits - 1; const int dstInfExp = (1 << dstExpBits) - 1; const int dstExpBias = dstInfExp >> 1; - const dst_rep_t dstMinNormal = DST_REP_C(1) << dstSigBits; - // Break a into a sign and representation of the absolute value. const src_rep_t aRep = srcToRep(a); - const src_rep_t aAbs = aRep & srcAbsMask; - const src_rep_t sign = aRep & srcSignMask; - dst_rep_t absResult; + const src_rep_t srcSign = extract_sign_from_src(aRep); + const src_rep_t srcExp = extract_exp_from_src(aRep); + const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep); + + dst_rep_t dstSign = srcSign; + dst_rep_t dstExp; + dst_rep_t dstSigFrac; - // If sizeof(src_rep_t) < sizeof(int), the subtraction result is promoted - // to (signed) int. To avoid that, explicitly cast to src_rep_t. - if ((src_rep_t)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) { + if (srcExp >= 1 && srcExp < (src_rep_t)srcInfExp) { // a is a normal number. - // Extend to the destination type by shifting the significand and - // exponent into the proper position and rebiasing the exponent. - absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits); - absResult += (dst_rep_t)(dstExpBias - srcExpBias) << dstSigBits; + dstExp = (dst_rep_t)srcExp + (dst_rep_t)(dstExpBias - srcExpBias); + dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits); } - else if (aAbs >= srcInfinity) { + else if (srcExp == srcInfExp) { // a is NaN or infinity. - // Conjure the result by beginning with infinity, then setting the qNaN - // bit (if needed) and right-aligning the rest of the trailing NaN - // payload field. - absResult = (dst_rep_t)dstInfExp << dstSigBits; - absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - srcSigBits); - absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - srcSigBits); + dstExp = dstInfExp; + dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits); } - else if (aAbs) { + else if (srcSigFrac) { // a is denormal. - // renormalize the significand and clear the leading bit, then insert - // the correct adjusted exponent in the destination type. - const int scale = src_rep_t_clz(aAbs) - src_rep_t_clz(srcMinNormal); - absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits + scale); - absResult ^= dstMinNormal; - const int resultExponent = dstExpBias - srcExpBias - scale + 1; - absResult |= (dst_rep_t)resultExponent << dstSigBits; + if (srcExpBits == dstExpBits) { + // The exponent fields are identical and this is a denormal number, so all + // the non-significand bits are zero. In particular, this branch is always + // taken when we extend a denormal F80 to F128. + dstExp = 0; + dstSigFrac = ((dst_rep_t)srcSigFrac) << (dstSigFracBits - srcSigFracBits); + } else { +#ifndef src_rep_t_clz + // If src_rep_t_clz is not defined this branch must be unreachable. + __builtin_unreachable(); +#else + // Renormalize the significand and clear the leading bit. 
+ // For F80 -> F128 this codepath is unused. + const int scale = clz_in_sig_frac(srcSigFrac) + 1; + dstExp = dstExpBias - srcExpBias - scale + 1; + dstSigFrac = (dst_rep_t)srcSigFrac + << (dstSigFracBits - srcSigFracBits + scale); + const dst_rep_t dstMinNormal = DST_REP_C(1) << (dstBits - 1 - dstExpBits); + dstSigFrac ^= dstMinNormal; +#endif + } } else { // a is zero. - absResult = 0; + dstExp = 0; + dstSigFrac = 0; } - // Apply the signbit to the absolute value. - const dst_rep_t result = absResult | (dst_rep_t)sign << (dstBits - srcBits); + const dst_rep_t result = construct_dst_rep(dstSign, dstExp, dstSigFrac); return dstFromRep(result); } diff --git a/compiler-rt/lib/builtins/fp_fixint_impl.inc b/compiler-rt/lib/builtins/fp_fixint_impl.inc index 2196d71..2f2f77c 100644 --- a/compiler-rt/lib/builtins/fp_fixint_impl.inc +++ b/compiler-rt/lib/builtins/fp_fixint_impl.inc @@ -34,7 +34,7 @@ static __inline fixint_t __fixint(fp_t a) { // If 0 <= exponent < significandBits, right shift to get the result. // Otherwise, shift left. if (exponent < significandBits) - return sign * (significand >> (significandBits - exponent)); + return (fixint_t)(sign * (significand >> (significandBits - exponent))); else - return sign * ((fixint_t)significand << (exponent - significandBits)); + return (fixint_t)(sign * ((fixuint_t)significand << (exponent - significandBits))); } diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h index 3fb13a0..b2a8950 100644 --- a/compiler-rt/lib/builtins/fp_lib.h +++ b/compiler-rt/lib/builtins/fp_lib.h @@ -22,22 +22,11 @@ #include "int_lib.h" #include "int_math.h" +#include "int_types.h" #include <limits.h> #include <stdbool.h> #include <stdint.h> -// x86_64 FreeBSD prior v9.3 define fixed-width types incorrectly in -// 32-bit mode. -#if defined(__FreeBSD__) && defined(__i386__) -#include <sys/param.h> -#if __FreeBSD_version < 903000 // v9.3 -#define uint64_t unsigned long long -#define int64_t long long -#undef UINT64_C -#define UINT64_C(c) (c##ULL) -#endif -#endif - #if defined SINGLE_PRECISION typedef uint16_t half_rep_t; @@ -54,8 +43,8 @@ static __inline int rep_clz(rep_t a) { return clzsi(a); } // 32x32 --> 64 bit multiply static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { const uint64_t product = (uint64_t)a * b; - *hi = product >> 32; - *lo = product; + *hi = (rep_t)(product >> 32); + *lo = (rep_t)product; } COMPILER_RT_ABI fp_t __addsf3(fp_t a, fp_t b); @@ -69,16 +58,7 @@ typedef double fp_t; #define REP_C UINT64_C #define significandBits 52 -static __inline int rep_clz(rep_t a) { -#if defined __LP64__ - return __builtin_clzl(a); -#else - if (a & REP_C(0xffffffff00000000)) - return clzsi(a >> 32); - else - return 32 + clzsi(a & REP_C(0xffffffff)); -#endif -} +static inline int rep_clz(rep_t a) { return __builtin_clzll(a); } #define loWord(a) (a & 0xffffffffU) #define hiWord(a) (a >> 32) @@ -105,17 +85,18 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b); #elif defined QUAD_PRECISION -#if __LDBL_MANT_DIG__ == 113 && defined(__SIZEOF_INT128__) -#define CRT_LDBL_128BIT +#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) typedef uint64_t half_rep_t; typedef __uint128_t rep_t; typedef __int128_t srep_t; -typedef long double fp_t; +typedef tf_float fp_t; #define HALF_REP_C UINT64_C #define REP_C (__uint128_t) +#if defined(CRT_HAS_IEEE_TF) // Note: Since there is no explicit way to tell compiler the constant is a // 128-bit integer, we let the constant be casted to 128-bit integer #define
significandBits 112 +#define TF_MANT_DIG (significandBits + 1) static __inline int rep_clz(rep_t a) { const union { @@ -200,27 +181,17 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) { #undef Word_HiMask #undef Word_LoMask #undef Word_FullMask -#endif // __LDBL_MANT_DIG__ == 113 && __SIZEOF_INT128__ +#endif // defined(CRT_HAS_IEEE_TF) +#else +typedef long double fp_t; +#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT) #else #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined. #endif #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) || \ - defined(CRT_LDBL_128BIT) + (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE)) #define typeWidth (sizeof(rep_t) * CHAR_BIT) -#define exponentBits (typeWidth - significandBits - 1) -#define maxExponent ((1 << exponentBits) - 1) -#define exponentBias (maxExponent >> 1) - -#define implicitBit (REP_C(1) << significandBits) -#define significandMask (implicitBit - 1U) -#define signBit (REP_C(1) << (significandBits + exponentBits)) -#define absMask (signBit - 1U) -#define exponentMask (absMask ^ significandMask) -#define oneRep ((rep_t)exponentBias << significandBits) -#define infRep exponentMask -#define quietBit (implicitBit >> 1) -#define qnanRep (exponentMask | quietBit) static __inline rep_t toRep(fp_t x) { const union { @@ -238,13 +209,28 @@ static __inline fp_t fromRep(rep_t x) { return rep.f; } +#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) +#define exponentBits (typeWidth - significandBits - 1) +#define maxExponent ((1 << exponentBits) - 1) +#define exponentBias (maxExponent >> 1) + +#define implicitBit (REP_C(1) << significandBits) +#define significandMask (implicitBit - 1U) +#define signBit (REP_C(1) << (significandBits + exponentBits)) +#define absMask (signBit - 1U) +#define exponentMask (absMask ^ significandMask) +#define oneRep ((rep_t)exponentBias << significandBits) +#define infRep exponentMask +#define quietBit (implicitBit >> 1) +#define qnanRep (exponentMask | quietBit) + static __inline int normalize(rep_t *significand) { const int shift = rep_clz(*significand) - rep_clz(implicitBit); *significand <<= shift; return 1 - shift; } -static __inline void wideLeftShift(rep_t *hi, rep_t *lo, int count) { +static __inline void wideLeftShift(rep_t *hi, rep_t *lo, unsigned int count) { *hi = *hi << count | *lo >> (typeWidth - count); *lo = *lo << count; } @@ -340,6 +326,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) { return fromRep(sign | ((rep_t)exp << significandBits) | sig); } +#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF) + // Avoid using fmax from libm. static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) { // If either argument is NaN, return the other argument. If both are NaN, @@ -386,31 +374,42 @@ static __inline fp_t __compiler_rt_fmax(fp_t x, fp_t y) { #endif } -#elif defined(QUAD_PRECISION) - -#if defined(CRT_LDBL_128BIT) -static __inline fp_t __compiler_rt_logbl(fp_t x) { +#elif defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE) +// The generic implementation only works for ieee754 floating point. For other +// floating point types, continue to rely on the libm implementation for now. 
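// Review note (not part of the patch): the block below is a three-way
// dispatch for the quad-precision helpers. If tf_float is IEEE binary128
// (CRT_HAS_IEEE_TF), the generic bit-level __compiler_rt_*X versions above
// are used; failing that, if long double is 128 bits (CRT_LDBL_128BIT), the
// libm long-double routines stand in; anything else is a hard error. The
// trailing #defines keep the older *l spellings valid either way. Callers
// such as __divtc3 use the helpers like this:
//
//   tf_float m = __compiler_rt_fmaxtf(crt_fabstf(c), crt_fabstf(d));
//   int e = (int)__compiler_rt_logbtf(m);  // exponent of the larger magnitude
//   c = __compiler_rt_scalbntf(c, -e);     // rescale to avoid overflow
//   d = __compiler_rt_scalbntf(d, -e);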
+#if defined(CRT_HAS_IEEE_TF) +static __inline tf_float __compiler_rt_logbtf(tf_float x) { return __compiler_rt_logbX(x); } -static __inline fp_t __compiler_rt_scalbnl(fp_t x, int y) { +static __inline tf_float __compiler_rt_scalbntf(tf_float x, int y) { return __compiler_rt_scalbnX(x, y); } -static __inline fp_t __compiler_rt_fmaxl(fp_t x, fp_t y) { +static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { return __compiler_rt_fmaxX(x, y); } -#else -// The generic implementation only works for ieee754 floating point. For other -// floating point types, continue to rely on the libm implementation for now. -static __inline long double __compiler_rt_logbl(long double x) { +#define __compiler_rt_logbl __compiler_rt_logbtf +#define __compiler_rt_scalbnl __compiler_rt_scalbntf +#define __compiler_rt_fmaxl __compiler_rt_fmaxtf +#define crt_fabstf crt_fabsf128 +#define crt_copysigntf crt_copysignf128 +#elif defined(CRT_LDBL_128BIT) +static __inline tf_float __compiler_rt_logbtf(tf_float x) { return crt_logbl(x); } -static __inline long double __compiler_rt_scalbnl(long double x, int y) { +static __inline tf_float __compiler_rt_scalbntf(tf_float x, int y) { return crt_scalbnl(x, y); } -static __inline long double __compiler_rt_fmaxl(long double x, long double y) { +static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) { return crt_fmaxl(x, y); } -#endif // CRT_LDBL_128BIT +#define __compiler_rt_logbl crt_logbl +#define __compiler_rt_scalbnl crt_scalbnl +#define __compiler_rt_fmaxl crt_fmaxl +#define crt_fabstf crt_fabsl +#define crt_copysigntf crt_copysignl +#else +#error Unsupported TF mode type +#endif #endif // *_PRECISION diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h index 91f6145..141fe63 100644 --- a/compiler-rt/lib/builtins/fp_trunc.h +++ b/compiler-rt/lib/builtins/fp_trunc.h @@ -19,19 +19,31 @@ typedef float src_t; typedef uint32_t src_rep_t; #define SRC_REP_C UINT32_C -static const int srcSigBits = 23; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 23; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 8; #elif defined SRC_DOUBLE typedef double src_t; typedef uint64_t src_rep_t; #define SRC_REP_C UINT64_C -static const int srcSigBits = 52; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 52; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 11; #elif defined SRC_QUAD -typedef long double src_t; +typedef tf_float src_t; typedef __uint128_t src_rep_t; #define SRC_REP_C (__uint128_t) -static const int srcSigBits = 112; +static const int srcBits = sizeof(src_t) * CHAR_BIT; +static const int srcSigFracBits = 112; +// -1 accounts for the sign bit. +// srcBits - srcSigFracBits - 1 +static const int srcExpBits = 15; #else #error Source should be double precision or quad precision! @@ -41,13 +53,32 @@ static const int srcSigBits = 112; typedef double dst_t; typedef uint64_t dst_rep_t; #define DST_REP_C UINT64_C -static const int dstSigBits = 52; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 52; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 11; + +#elif defined DST_80 +typedef xf_float dst_t; +typedef __uint128_t dst_rep_t; +#define DST_REP_C (__uint128_t) +static const int dstBits = 80; +static const int dstSigFracBits = 63; +// -1 accounts for the sign bit. 
+// -1 accounts for the explicitly stored integer bit. +// dstBits - dstSigFracBits - 1 - 1 +static const int dstExpBits = 15; #elif defined DST_SINGLE typedef float dst_t; typedef uint32_t dst_rep_t; #define DST_REP_C UINT32_C -static const int dstSigBits = 23; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 23; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #elif defined DST_HALF #ifdef COMPILER_RT_HAS_FLOAT16 @@ -57,22 +88,58 @@ typedef uint16_t dst_t; #endif typedef uint16_t dst_rep_t; #define DST_REP_C UINT16_C -static const int dstSigBits = 10; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 10; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 5; #elif defined DST_BFLOAT typedef __bf16 dst_t; typedef uint16_t dst_rep_t; #define DST_REP_C UINT16_C -static const int dstSigBits = 7; +static const int dstBits = sizeof(dst_t) * CHAR_BIT; +static const int dstSigFracBits = 7; +// -1 accounts for the sign bit. +// dstBits - dstSigFracBits - 1 +static const int dstExpBits = 8; #else #error Destination should be single precision or double precision! #endif // end destination precision +// TODO: These helper routines should be placed into fp_lib.h +// Currently they depend on macros/constants defined above. + +static inline src_rep_t extract_sign_from_src(src_rep_t x) { + const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1); + return (x & srcSignMask) >> (srcBits - 1); +} + +static inline src_rep_t extract_exp_from_src(src_rep_t x) { + const int srcSigBits = srcBits - 1 - srcExpBits; + const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits; + return (x & srcExpMask) >> srcSigBits; +} + +static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) { + const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1; + return x & srcSigFracMask; +} + +static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) { + dst_rep_t result = (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac; + // Set the explicit integer bit in F80 if present. + if (dstBits == 80 && exp) { + result |= (DST_REP_C(1) << dstSigFracBits); + } + return result; +} + // End of specialization parameters. Two helper routines for conversion to and // from the representation of floating-point data as integer values follow. -static __inline src_rep_t srcToRep(src_t x) { +static inline src_rep_t srcToRep(src_t x) { const union { src_t f; src_rep_t i; @@ -80,7 +147,7 @@ static __inline src_rep_t srcToRep(src_t x) { return rep.i; } -static __inline dst_t dstFromRep(dst_rep_t x) { +static inline dst_t dstFromRep(dst_rep_t x) { const union { dst_t f; dst_rep_t i; diff --git a/compiler-rt/lib/builtins/fp_trunc_impl.inc b/compiler-rt/lib/builtins/fp_trunc_impl.inc index 6662be7..f684924 100644 --- a/compiler-rt/lib/builtins/fp_trunc_impl.inc +++ b/compiler-rt/lib/builtins/fp_trunc_impl.inc @@ -38,95 +38,118 @@ #include "fp_trunc.h" +// The destination type may use a usual IEEE-754 interchange format or Intel +// 80-bit format. In particular, for the destination type dstSigFracBits may be +// not equal to dstSigBits. The source type is assumed to be one of IEEE-754 +// standard types. static __inline dst_t __truncXfYf2__(src_t a) { // Various constants whose values follow from the type parameters. // Any reasonable optimizer will fold and propagate all of these. 
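// Review note (illustrative sketch, not part of the patch): the first new
// branch in the body below is an exact fast path taken when source and
// destination carry equally wide exponents -- e.g. binary128 -> x87, both
// 15 bits -- and no fraction bits would be lost, so the fields can be copied
// straight across. The exactness test is a round-trip shift:

#include <stdint.h>
static int tail_is_zero(uint64_t rep, int tailBits) { // 0 < tailBits < 64
  return ((rep >> tailBits) << tailBits) == rep; // low tailBits all zero?
}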
- const int srcBits = sizeof(src_t) * CHAR_BIT; - const int srcExpBits = srcBits - srcSigBits - 1; const int srcInfExp = (1 << srcExpBits) - 1; const int srcExpBias = srcInfExp >> 1; - const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits; - const src_rep_t srcSignificandMask = srcMinNormal - 1; - const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits; - const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits); - const src_rep_t srcAbsMask = srcSignMask - 1; - const src_rep_t roundMask = (SRC_REP_C(1) << (srcSigBits - dstSigBits)) - 1; - const src_rep_t halfway = SRC_REP_C(1) << (srcSigBits - dstSigBits - 1); - const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1); + const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigFracBits; + const src_rep_t roundMask = + (SRC_REP_C(1) << (srcSigFracBits - dstSigFracBits)) - 1; + const src_rep_t halfway = SRC_REP_C(1) + << (srcSigFracBits - dstSigFracBits - 1); + const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigFracBits - 1); const src_rep_t srcNaNCode = srcQNaN - 1; - const int dstBits = sizeof(dst_t) * CHAR_BIT; - const int dstExpBits = dstBits - dstSigBits - 1; const int dstInfExp = (1 << dstExpBits) - 1; const int dstExpBias = dstInfExp >> 1; - - const int underflowExponent = srcExpBias + 1 - dstExpBias; const int overflowExponent = srcExpBias + dstInfExp - dstExpBias; - const src_rep_t underflow = (src_rep_t)underflowExponent << srcSigBits; - const src_rep_t overflow = (src_rep_t)overflowExponent << srcSigBits; - const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigBits - 1); + const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigFracBits - 1); const dst_rep_t dstNaNCode = dstQNaN - 1; - // Break a into a sign and representation of the absolute value. const src_rep_t aRep = srcToRep(a); - const src_rep_t aAbs = aRep & srcAbsMask; - const src_rep_t sign = aRep & srcSignMask; - dst_rep_t absResult; + const src_rep_t srcSign = extract_sign_from_src(aRep); + const src_rep_t srcExp = extract_exp_from_src(aRep); + const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep); + + dst_rep_t dstSign = srcSign; + dst_rep_t dstExp; + dst_rep_t dstSigFrac; + + // Same size exponents and a's significand tail is 0. + // The significand can be truncated and the exponent can be copied over. + const int sigFracTailBits = srcSigFracBits - dstSigFracBits; + if (srcExpBits == dstExpBits && + ((aRep >> sigFracTailBits) << sigFracTailBits) == aRep) { + dstExp = srcExp; + dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); + return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); + } - if (aAbs - underflow < aAbs - overflow) { + const int dstExpCandidate = ((int)srcExp - srcExpBias) + dstExpBias; + if (dstExpCandidate >= 1 && dstExpCandidate < dstInfExp) { // The exponent of a is within the range of normal numbers in the - // destination format. We can convert by simply right-shifting with + // destination format. We can convert by simply right-shifting with // rounding and adjusting the exponent. - absResult = aAbs >> (srcSigBits - dstSigBits); - absResult -= (dst_rep_t)(srcExpBias - dstExpBias) << dstSigBits; + dstExp = dstExpCandidate; + dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); - const src_rep_t roundBits = aAbs & roundMask; + const src_rep_t roundBits = srcSigFrac & roundMask; // Round to nearest. if (roundBits > halfway) - absResult++; + dstSigFrac++; // Tie to even. 
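The `roundBits > halfway` test above, together with the ties-to-even branch that follows, implements IEEE round-to-nearest-even. A standalone demonstration of the same policy using the hardware's double-to-float conversion (assumes a hosted C compiler with default rounding; not part of the patch):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
  // Exactly halfway between 1.0f and the next float (1 + 2^-23):
  // the tie resolves to the even significand, i.e. 1.0f.
  double lo_tie = 1.0 + ldexp(1.0, -24);
  // Exactly halfway between 1 + 2^-23 and 1 + 2^-22:
  // the tie resolves upward to the even significand 1 + 2^-22.
  double hi_tie = 1.0 + 3.0 * ldexp(1.0, -24);
  printf("%a -> %a\n", lo_tie, (double)(float)lo_tie); // 0x1.000001p+0 -> 0x1p+0
  printf("%a -> %a\n", hi_tie, (double)(float)hi_tie); // 0x1.000003p+0 -> 0x1.000004p+0
  return 0;
}
```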
else if (roundBits == halfway) - absResult += absResult & 1; - } else if (aAbs > srcInfinity) { + dstSigFrac += dstSigFrac & 1; + + // Rounding has changed the exponent. + if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { + dstExp += 1; + dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); + } + } else if (srcExp == srcInfExp && srcSigFrac) { // a is NaN. // Conjure the result by beginning with infinity, setting the qNaN // bit and inserting the (truncated) trailing NaN field. - absResult = (dst_rep_t)dstInfExp << dstSigBits; - absResult |= dstQNaN; - absResult |= - ((aAbs & srcNaNCode) >> (srcSigBits - dstSigBits)) & dstNaNCode; - } else if (aAbs >= overflow) { - // a overflows to infinity. - absResult = (dst_rep_t)dstInfExp << dstSigBits; + dstExp = dstInfExp; + dstSigFrac = dstQNaN; + dstSigFrac |= ((srcSigFrac & srcNaNCode) >> sigFracTailBits) & dstNaNCode; + } else if ((int)srcExp >= overflowExponent) { + dstExp = dstInfExp; + dstSigFrac = 0; } else { // a underflows on conversion to the destination type or is an exact // zero. The result may be a denormal or zero. Extract the exponent // to get the shift amount for the denormalization. - const int aExp = aAbs >> srcSigBits; - const int shift = srcExpBias - dstExpBias - aExp + 1; + src_rep_t significand = srcSigFrac; + int shift = srcExpBias - dstExpBias - srcExp; - const src_rep_t significand = (aRep & srcSignificandMask) | srcMinNormal; + if (srcExp) { + // Set the implicit integer bit if the source is a normal number. + significand |= srcMinNormal; + shift += 1; + } // Right shift by the denormalization amount with sticky. - if (shift > srcSigBits) { - absResult = 0; + if (shift > srcSigFracBits) { + dstExp = 0; + dstSigFrac = 0; } else { - const bool sticky = (significand << (srcBits - shift)) != 0; + dstExp = 0; + const bool sticky = shift && ((significand << (srcBits - shift)) != 0); src_rep_t denormalizedSignificand = significand >> shift | sticky; - absResult = denormalizedSignificand >> (srcSigBits - dstSigBits); + dstSigFrac = denormalizedSignificand >> sigFracTailBits; const src_rep_t roundBits = denormalizedSignificand & roundMask; // Round to nearest if (roundBits > halfway) - absResult++; + dstSigFrac++; // Ties to even else if (roundBits == halfway) - absResult += absResult & 1; + dstSigFrac += dstSigFrac & 1; + + // Rounding has changed the exponent. + if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { + dstExp += 1; + dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); + } } } - // Apply the signbit to the absolute value. - const dst_rep_t result = absResult | sign >> (srcBits - dstBits); - return dstFromRep(result); + return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); } diff --git a/compiler-rt/lib/builtins/gcc_personality_v0.c b/compiler-rt/lib/builtins/gcc_personality_v0.c index 58fd7ce..ef63a5f 100644 --- a/compiler-rt/lib/builtins/gcc_personality_v0.c +++ b/compiler-rt/lib/builtins/gcc_personality_v0.c @@ -219,7 +219,7 @@ COMPILER_RT_ABI _Unwind_Reason_Code __gcc_personality_v0( } // Walk call-site table looking for range that includes current PC. 
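In the hunk that follows, `callSiteTableLength` is widened from `uint32_t` to `size_t`: the length is ULEB128-encoded and in principle unbounded, so truncating it could misplace `callSiteTableEnd`. For reference, a minimal decoder of the shape `readULEB128()` implements (a sketch; the real helper is defined earlier in that file):

```c
#include <stddef.h>
#include <stdint.h>

// Decode one ULEB128 value and advance *p past it.
static size_t read_uleb128(const uint8_t **p) {
  size_t result = 0;
  int shift = 0;
  uint8_t byte;
  do {
    byte = *(*p)++;
    result |= (size_t)(byte & 0x7f) << shift; // 7 payload bits per byte
    shift += 7;
  } while (byte & 0x80);                      // high bit set = more bytes
  return result;
}
```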
uint8_t callSiteEncoding = *lsda++; - uint32_t callSiteTableLength = readULEB128(&lsda); + size_t callSiteTableLength = readULEB128(&lsda); const uint8_t *callSiteTableStart = lsda; const uint8_t *callSiteTableEnd = callSiteTableStart + callSiteTableLength; const uint8_t *p = callSiteTableStart; diff --git a/compiler-rt/lib/builtins/i386/chkstk.S b/compiler-rt/lib/builtins/i386/chkstk.S index f0bea21..cdd9a4c 100644 --- a/compiler-rt/lib/builtins/i386/chkstk.S +++ b/compiler-rt/lib/builtins/i386/chkstk.S @@ -4,19 +4,19 @@ #include "../assembly.h" -// _chkstk routine +#ifdef __i386__ + +// _chkstk (_alloca) routine - probe stack between %esp and (%esp-%eax) in 4k increments, +// then decrement %esp by %eax. Preserves all registers except %esp and flags. // This routine is windows specific // http://msdn.microsoft.com/en-us/library/ms648426.aspx -#ifdef __i386__ - .text .balign 4 -DEFINE_COMPILERRT_FUNCTION(__chkstk_ms) +DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function push %ecx - push %eax cmp $0x1000,%eax - lea 12(%esp),%ecx + lea 8(%esp),%ecx // esp before calling this routine -> ecx jb 1f 2: sub $0x1000,%ecx @@ -27,9 +27,13 @@ DEFINE_COMPILERRT_FUNCTION(__chkstk_ms) 1: sub %eax,%ecx test %ecx,(%ecx) - pop %eax - pop %ecx + + lea 4(%esp),%eax // load pointer to the return address into eax + mov %ecx,%esp // install the new top of stack pointer into esp + mov -4(%eax),%ecx // restore ecx + push (%eax) // push return address onto the stack + sub %esp,%eax // restore the original value in eax ret -END_COMPILERRT_FUNCTION(__chkstk_ms) +END_COMPILERRT_FUNCTION(_alloca) #endif // __i386__ diff --git a/compiler-rt/lib/builtins/i386/chkstk2.S b/compiler-rt/lib/builtins/i386/chkstk2.S deleted file mode 100644 index 5d6cbdf..0000000 --- a/compiler-rt/lib/builtins/i386/chkstk2.S +++ /dev/null @@ -1,41 +0,0 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "../assembly.h" - -#ifdef __i386__ - -// _chkstk (_alloca) routine - probe stack between %esp and (%esp-%eax) in 4k increments, -// then decrement %esp by %eax. Preserves all registers except %esp and flags. 
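Both the rewritten `chkstk.S` above and the `chkstk2.S` being deleted (its remaining lines follow) implement the same Windows requirement: the stack grows one guard page at a time, so an allocation larger than 4 KiB must touch each intervening page in order. A rough C model of the probe loop (illustrative only; the real routine cannot use the stack it is probing):

```c
// 'top' models the incoming stack pointer, 'size' the requested
// allocation (as in %esp/%eax in the assembly above).
static void probe_stack(volatile char *top, unsigned long size) {
  while (size >= 0x1000) {
    top -= 0x1000;
    (void)*top;            // touch the page so the guard page faults in order
    size -= 0x1000;
  }
  (void)*(top - size);     // final probe at the new stack top
}
```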
-// This routine is windows specific -// http://msdn.microsoft.com/en-us/library/ms648426.aspx - -.text -.balign 4 -DEFINE_COMPILERRT_FUNCTION(_alloca) // _chkstk and _alloca are the same function -DEFINE_COMPILERRT_FUNCTION(__chkstk) - push %ecx - cmp $0x1000,%eax - lea 8(%esp),%ecx // esp before calling this routine -> ecx - jb 1f -2: - sub $0x1000,%ecx - test %ecx,(%ecx) - sub $0x1000,%eax - cmp $0x1000,%eax - ja 2b -1: - sub %eax,%ecx - test %ecx,(%ecx) - - lea 4(%esp),%eax // load pointer to the return address into eax - mov %ecx,%esp // install the new top of stack pointer into esp - mov -4(%eax),%ecx // restore ecx - push (%eax) // push return address onto the stack - sub %esp,%eax // restore the original value in eax - ret -END_COMPILERRT_FUNCTION(__chkstk) -END_COMPILERRT_FUNCTION(_alloca) - -#endif // __i386__ diff --git a/compiler-rt/lib/builtins/i386/floatdixf.S b/compiler-rt/lib/builtins/i386/floatdixf.S index 19dd083..486e3b0 100644 --- a/compiler-rt/lib/builtins/i386/floatdixf.S +++ b/compiler-rt/lib/builtins/i386/floatdixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatdixf(di_int a); +// xf_float __floatdixf(di_int a); #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/i386/floatundixf.S b/compiler-rt/lib/builtins/i386/floatundixf.S index 30b4d9f..778c3dc 100644 --- a/compiler-rt/lib/builtins/i386/floatundixf.S +++ b/compiler-rt/lib/builtins/i386/floatundixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatundixf(du_int a);16 +// xf_float __floatundixf(du_int a);16 #ifdef __i386__ diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h index fb791eb..f6c1b7c 100644 --- a/compiler-rt/lib/builtins/int_lib.h +++ b/compiler-rt/lib/builtins/int_lib.h @@ -49,7 +49,7 @@ #define SYMBOL_NAME(name) XSTR(__USER_LABEL_PREFIX__) #name #if defined(__ELF__) || defined(__MINGW32__) || defined(__wasm__) || \ - defined(_AIX) + defined(_AIX) || defined(__CYGWIN__) #define COMPILER_RT_ALIAS(name, aliasname) \ COMPILER_RT_ABI __typeof(name) aliasname __attribute__((__alias__(#name))); #elif defined(__APPLE__) @@ -119,14 +119,14 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem); #if defined(_MSC_VER) && !defined(__clang__) #include <intrin.h> -int __inline __builtin_ctz(uint32_t value) { +static int __inline __builtin_ctz(uint32_t value) { unsigned long trailing_zero = 0; if (_BitScanForward(&trailing_zero, value)) return trailing_zero; return 32; } -int __inline __builtin_clz(uint32_t value) { +static int __inline __builtin_clz(uint32_t value) { unsigned long leading_zero = 0; if (_BitScanReverse(&leading_zero, value)) return 31 - leading_zero; @@ -134,14 +134,14 @@ int __inline __builtin_clz(uint32_t value) { } #if defined(_M_ARM) || defined(_M_X64) -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { unsigned long leading_zero = 0; if (_BitScanReverse64(&leading_zero, value)) return 63 - leading_zero; return 64; } #else -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { if (value == 0) return 64; uint32_t msh = (uint32_t)(value >> 32); @@ -154,7 +154,7 @@ int __inline __builtin_clzll(uint64_t value) { #define __builtin_clzl __builtin_clzll -bool __inline __builtin_sadd_overflow(int x, int y, int *result) { +static bool __inline __builtin_sadd_overflow(int x, int y, int *result) { if ((x < 0) != (y < 0)) { *result = x + y; return false; diff --git a/compiler-rt/lib/builtins/int_math.h
b/compiler-rt/lib/builtins/int_math.h index 48b9580..08bfe92 100644 --- a/compiler-rt/lib/builtins/int_math.h +++ b/compiler-rt/lib/builtins/int_math.h @@ -65,6 +65,14 @@ #define crt_copysign(x, y) __builtin_copysign((x), (y)) #define crt_copysignf(x, y) __builtin_copysignf((x), (y)) #define crt_copysignl(x, y) __builtin_copysignl((x), (y)) +// We define __has_builtin to always return 0 for GCC versions below 10, +// but __builtin_copysignf128 is available since version 7. +#if __has_builtin(__builtin_copysignf128) || \ + (defined(__GNUC__) && __GNUC__ >= 7) +#define crt_copysignf128(x, y) __builtin_copysignf128((x), (y)) +#elif __has_builtin(__builtin_copysignq) +#define crt_copysignf128(x, y) __builtin_copysignq((x), (y)) +#endif #endif #if defined(_MSC_VER) && !defined(__clang__) @@ -75,6 +83,13 @@ #define crt_fabs(x) __builtin_fabs((x)) #define crt_fabsf(x) __builtin_fabsf((x)) #define crt_fabsl(x) __builtin_fabsl((x)) +// We define __has_builtin to always return 0 for GCC versions below 10, +// but __builtin_fabsf128 is available since version 7. +#if __has_builtin(__builtin_fabsf128) || (defined(__GNUC__) && __GNUC__ >= 7) +#define crt_fabsf128(x) __builtin_fabsf128((x)) +#elif __has_builtin(__builtin_fabsq) +#define crt_fabsf128(x) __builtin_fabsq((x)) +#endif #endif #if defined(_MSC_VER) && !defined(__clang__) diff --git a/compiler-rt/lib/builtins/int_mulo_impl.inc b/compiler-rt/lib/builtins/int_mulo_impl.inc index 567d8b9..27e7c8c 100644 --- a/compiler-rt/lib/builtins/int_mulo_impl.inc +++ b/compiler-rt/lib/builtins/int_mulo_impl.inc @@ -18,10 +18,10 @@ static __inline fixint_t __muloXi4(fixint_t a, fixint_t b, int *overflow) { const int N = (int)(sizeof(fixint_t) * CHAR_BIT); - const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MIN = (fixint_t)((fixuint_t)1 << (N - 1)); const fixint_t MAX = ~MIN; *overflow = 0; - fixint_t result = a * b; + fixint_t result = (fixuint_t)a * b; if (a == MIN) { if (b != 0 && b != 1) *overflow = 1; diff --git a/compiler-rt/lib/builtins/int_mulv_impl.inc b/compiler-rt/lib/builtins/int_mulv_impl.inc index 1e92071..06559cf 100644 --- a/compiler-rt/lib/builtins/int_mulv_impl.inc +++ b/compiler-rt/lib/builtins/int_mulv_impl.inc @@ -18,7 +18,7 @@ static __inline fixint_t __mulvXi3(fixint_t a, fixint_t b) { const int N = (int)(sizeof(fixint_t) * CHAR_BIT); - const fixint_t MIN = (fixint_t)1 << (N - 1); + const fixint_t MIN = (fixint_t)((fixuint_t)1 << (N - 1)); const fixint_t MAX = ~MIN; if (a == MIN) { if (b == 0 || b == 1) diff --git a/compiler-rt/lib/builtins/int_to_fp.h b/compiler-rt/lib/builtins/int_to_fp.h new file mode 100644 index 0000000..2c1218f --- /dev/null +++ b/compiler-rt/lib/builtins/int_to_fp.h @@ -0,0 +1,82 @@ +//===-- int_to_fp.h - integer to floating point conversion ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Set source and destination defines in order to use a correctly +// parameterised floatXiYf implementation. 
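The new header (continuing below) is instantiated the same way as `fp_trunc.h`: each `float*i*f.c` translation unit picks a source/destination pair via defines and includes the shared implementation. A sketch of how a quad builtin such as `__floatditf` would use it (names follow the file's conventions; the actual .c files live elsewhere in the patch and may differ in detail):

```c
// floatditf.c (sketch): convert a signed 64-bit integer to tf_float
// via the shared __floatXiYf__ routine.
#define QUAD_PRECISION
#include "fp_lib.h"

#if defined(CRT_HAS_TF_MODE)
#define SRC_I64
#define DST_QUAD
#include "int_to_fp_impl.inc"

COMPILER_RT_ABI dst_t __floatditf(src_t a) { return __floatXiYf__(a); }
#endif
```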
+// +//===----------------------------------------------------------------------===// + +#ifndef INT_TO_FP_H +#define INT_TO_FP_H + +#include "int_lib.h" + +#if defined SRC_I64 +typedef int64_t src_t; +typedef uint64_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __builtin_clzll(x); } + +#elif defined SRC_U64 +typedef uint64_t src_t; +typedef uint64_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __builtin_clzll(x); } + +#elif defined SRC_I128 +typedef __int128_t src_t; +typedef __uint128_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __clzti2(x); } + +#elif defined SRC_U128 +typedef __uint128_t src_t; +typedef __uint128_t usrc_t; +static __inline int clzSrcT(usrc_t x) { return __clzti2(x); } + +#else +#error Source should be a handled integer type. +#endif + +#if defined DST_SINGLE +typedef float dst_t; +typedef uint32_t dst_rep_t; +#define DST_REP_C UINT32_C + +enum { + dstSigBits = 23, +}; + +#elif defined DST_DOUBLE +typedef double dst_t; +typedef uint64_t dst_rep_t; +#define DST_REP_C UINT64_C + +enum { + dstSigBits = 52, +}; + +#elif defined DST_QUAD +typedef tf_float dst_t; +typedef __uint128_t dst_rep_t; +#define DST_REP_C (__uint128_t) + +enum { + dstSigBits = 112, +}; + +#else +#error Destination should be a handled floating point type +#endif + +static __inline dst_t dstFromRep(dst_rep_t x) { + const union { + dst_t f; + dst_rep_t i; + } rep = {.i = x}; + return rep.f; +} + +#endif // INT_TO_FP_H diff --git a/compiler-rt/lib/builtins/int_to_fp_impl.inc b/compiler-rt/lib/builtins/int_to_fp_impl.inc new file mode 100644 index 0000000..51f76fd --- /dev/null +++ b/compiler-rt/lib/builtins/int_to_fp_impl.inc @@ -0,0 +1,72 @@ +//===-- int_to_fp_impl.inc - integer to floating point conversion ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a generic conversion from an integer type to an +// IEEE-754 floating point type, allowing a common implementation to be shared +// without copy and paste. +// +//===----------------------------------------------------------------------===// + +#include "int_to_fp.h" + +static __inline dst_t __floatXiYf__(src_t a) { + if (a == 0) + return 0.0; + + enum { + dstMantDig = dstSigBits + 1, + srcBits = sizeof(src_t) * CHAR_BIT, + srcIsSigned = ((src_t)-1) < 0, + }; + + const src_t s = srcIsSigned ? a >> (srcBits - 1) : 0; + + a = (usrc_t)(a ^ s) - s; + int sd = srcBits - clzSrcT(a); // number of significant digits + int e = sd - 1; // exponent + if (sd > dstMantDig) { + // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx + // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR + // 12345678901234567890123456 + // 1 = msb 1 bit + // P = bit dstMantDig-1 bits to the right of 1 + // Q = bit dstMantDig bits to the right of 1 + // R = "or" of all bits to the right of Q + if (sd == dstMantDig + 1) { + a <<= 1; + } else if (sd == dstMantDig + 2) { + // Do nothing.
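A concrete instance of the `sd == dstMantDig + 2` case just above, before the remaining branch continues below: for unsigned 64-bit to float, `dstMantDig` is 24, so a 26-significant-bit input already has its round (Q) and sticky (R) bits in position and only the shared rounding tail is needed (hosted-compiler demo, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // 2^25 + 3 has 26 significant bits, two below float precision.
  // Round-to-nearest pushes it up to 2^25 + 4, the nearest float.
  uint64_t a = (1ull << 25) + 3;
  printf("%.1f\n", (double)(float)a); // 33554436.0
  return 0;
}
```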
+ } else { + a = ((usrc_t)a >> (sd - (dstMantDig + 2))) | + ((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0); + } + // finish: + a |= (a & 4) != 0; // Or P into R + ++a; // round - this step may add a significant bit + a >>= 2; // dump Q and R + // a is now rounded to dstMantDig or dstMantDig+1 bits + if (a & ((usrc_t)1 << dstMantDig)) { + a >>= 1; + ++e; + } + // a is now rounded to dstMantDig bits + } else { + a <<= (dstMantDig - sd); + // a is now rounded to dstMantDig bits + } + const int dstBits = sizeof(dst_t) * CHAR_BIT; + const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1); + const int dstExpBits = dstBits - dstSigBits - 1; + const int dstExpBias = (1 << (dstExpBits - 1)) - 1; + const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1; + // Combine sign, exponent, and mantissa. + const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | + ((dst_rep_t)(e + dstExpBias) << dstSigBits) | + ((dst_rep_t)(a) & dstSignificandMask); + return dstFromRep(result); +} diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h index e94d315..48862f3 100644 --- a/compiler-rt/lib/builtins/int_types.h +++ b/compiler-rt/lib/builtins/int_types.h @@ -107,8 +107,8 @@ typedef union { static __inline ti_int make_ti(di_int h, di_int l) { twords r; - r.s.high = h; - r.s.low = l; + r.s.high = (du_int)h; + r.s.low = (du_int)l; return r.all; } @@ -139,7 +139,6 @@ typedef union { udwords u; double f; } double_bits; -#endif typedef struct { #if _YUGA_LITTLE_ENDIAN @@ -165,16 +164,83 @@ typedef struct { #define HAS_80_BIT_LONG_DOUBLE 0 #endif -#if CRT_HAS_FLOATING_POINT +#if HAS_80_BIT_LONG_DOUBLE +typedef long double xf_float; +typedef union { + uqwords u; + xf_float f; +} xf_bits; +#endif + +#ifdef __powerpc64__ +// From https://gcc.gnu.org/wiki/Ieee128PowerPC: +// PowerPC64 uses the following suffixes: +// IFmode: IBM extended double +// KFmode: IEEE 128-bit floating point +// TFmode: Matches the default for long double. With -mabi=ieeelongdouble, +// it is IEEE 128-bit, with -mabi=ibmlongdouble IBM extended double +// Since compiler-rt only implements the tf set of libcalls, we use long double +// for the tf_float typedef. +typedef long double tf_float; +#define CRT_LDBL_128BIT +#define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 && !defined(__LONG_DOUBLE_IBM128__) +#define CRT_HAS_IEEE_TF +#define CRT_LDBL_IEEE_F128 +#endif +#define TF_C(x) x##L +#elif __LDBL_MANT_DIG__ == 113 || \ + (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28) +// Use long double instead of __float128 if it matches the IEEE 128-bit format +// or the IBM hexadecimal format. +#define CRT_LDBL_128BIT +#define CRT_HAS_F128 +#if __LDBL_MANT_DIG__ == 113 +#define CRT_HAS_IEEE_TF +#define CRT_LDBL_IEEE_F128 +#endif +typedef long double tf_float; +#define TF_C(x) x##L +#elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__) +#define CRT_HAS___FLOAT128_KEYWORD +#define CRT_HAS_F128 +// NB: we assume the __float128 type uses IEEE representation. +#define CRT_HAS_IEEE_TF +typedef __float128 tf_float; +#define TF_C(x) x##Q +#endif + +#ifdef CRT_HAS_F128 typedef union { uqwords u; - long double f; -} long_double_bits; + tf_float f; +} tf_bits; +#endif + +// __(u)int128_t is currently needed to compile the *tf builtins as we would +// otherwise need to manually expand the bit manipulation on two 64-bit values.
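These type selections feed the `CRT_HAS_TF_MODE` guard defined just below, which the `*tf` builtins later in this patch test instead of the old `CRT_LDBL_128BIT`. A rough compile-time probe of the selection logic (assumes a hosted build with `fp_lib.h` on the include path; illustrative only):

```c
#define QUAD_PRECISION
#include "fp_lib.h"
#include <stdio.h>

int main(void) {
#if defined(CRT_LDBL_128BIT)
  puts("tf_float is long double (128-bit)");
#elif defined(CRT_HAS___FLOAT128_KEYWORD)
  puts("tf_float is __float128");
#else
  puts("no 128-bit float type; *tf builtins are compiled out");
#endif
  return 0;
}
```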
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) +#define CRT_HAS_TF_MODE +#endif #if __STDC_VERSION__ >= 199901L typedef float _Complex Fcomplex; typedef double _Complex Dcomplex; typedef long double _Complex Lcomplex; +#if defined(CRT_LDBL_128BIT) +typedef Lcomplex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#elif defined(CRT_HAS___FLOAT128_KEYWORD) +#if defined(__clang_major__) && __clang_major__ > 10 +// Clang prior to 11 did not support __float128 _Complex. +typedef __float128 _Complex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#elif defined(__GNUC__) && __GNUC__ >= 7 +// GCC does not allow __float128 _Complex, but accepts _Float128 _Complex. +typedef _Float128 _Complex Qcomplex; +#define CRT_HAS_NATIVE_COMPLEX_F128 +#endif +#endif #define COMPLEX_REAL(x) __real__(x) #define COMPLEX_IMAGINARY(x) __imag__(x) @@ -194,5 +260,17 @@ typedef struct { #define COMPLEX_REAL(x) (x).real #define COMPLEX_IMAGINARY(x) (x).imaginary #endif + +#ifdef CRT_HAS_NATIVE_COMPLEX_F128 +#define COMPLEXTF_REAL(x) __real__(x) +#define COMPLEXTF_IMAGINARY(x) __imag__(x) +#elif defined(CRT_HAS_F128) +typedef struct { + tf_float real, imaginary; +} Qcomplex; +#define COMPLEXTF_REAL(x) (x).real +#define COMPLEXTF_IMAGINARY(x) (x).imaginary #endif + +#endif // CRT_HAS_FLOATING_POINT #endif // INT_TYPES_H diff --git a/compiler-rt/lib/builtins/lshrti3.c b/compiler-rt/lib/builtins/lshrti3.c index d00a220..5dc8a0a 100644 --- a/compiler-rt/lib/builtins/lshrti3.c +++ b/compiler-rt/lib/builtins/lshrti3.c @@ -18,7 +18,7 @@ // Precondition: 0 <= b < bits_in_tword -COMPILER_RT_ABI ti_int __lshrti3(ti_int a, si_int b) { +COMPILER_RT_ABI ti_int __lshrti3(ti_int a, int b) { const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT); utwords input; utwords result; diff --git a/compiler-rt/lib/builtins/mulodi4.c b/compiler-rt/lib/builtins/mulodi4.c index 7209676..6ecf926 100644 --- a/compiler-rt/lib/builtins/mulodi4.c +++ b/compiler-rt/lib/builtins/mulodi4.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t di_int +#define fixuint_t du_int #include "int_mulo_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulosi4.c b/compiler-rt/lib/builtins/mulosi4.c index 4e03c24..3fd18a1 100644 --- a/compiler-rt/lib/builtins/mulosi4.c +++ b/compiler-rt/lib/builtins/mulosi4.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t si_int +#define fixuint_t su_int #include "int_mulo_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/muloti4.c b/compiler-rt/lib/builtins/muloti4.c index 9a7aa85..9aab6fc 100644 --- a/compiler-rt/lib/builtins/muloti4.c +++ b/compiler-rt/lib/builtins/muloti4.c @@ -19,6 +19,7 @@ // Effects: sets *overflow to 1 if a * b overflows #define fixint_t ti_int +#define fixuint_t tu_int #include "int_mulo_impl.inc" COMPILER_RT_ABI ti_int __muloti4(ti_int a, ti_int b, int *overflow) { diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c index bb7f6aa..a89832f 100644 --- a/compiler-rt/lib/builtins/multc3.c +++ b/compiler-rt/lib/builtins/multc3.c @@ -10,56 +10,61 @@ // //===----------------------------------------------------------------------===// +#define QUAD_PRECISION +#include "fp_lib.h" #include "int_lib.h" #include "int_math.h" +#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128) + // Returns: the product of a + ib and c + id -COMPILER_RT_ABI long double _Complex __multc3(long double a, long double b, - long double c, 
long double d) { - long double ac = a * c; - long double bd = b * d; - long double ad = a * d; - long double bc = b * c; - long double _Complex z; - __real__ z = ac - bd; - __imag__ z = ad + bc; - if (crt_isnan(__real__ z) && crt_isnan(__imag__ z)) { +COMPILER_RT_ABI Qcomplex __multc3(fp_t a, fp_t b, fp_t c, fp_t d) { + fp_t ac = a * c; + fp_t bd = b * d; + fp_t ad = a * d; + fp_t bc = b * c; + Qcomplex z; + COMPLEXTF_REAL(z) = ac - bd; + COMPLEXTF_IMAGINARY(z) = ad + bc; + if (crt_isnan(COMPLEXTF_REAL(z)) && crt_isnan(COMPLEXTF_IMAGINARY(z))) { int recalc = 0; if (crt_isinf(a) || crt_isinf(b)) { - a = crt_copysignl(crt_isinf(a) ? 1 : 0, a); - b = crt_copysignl(crt_isinf(b) ? 1 : 0, b); + a = crt_copysigntf(crt_isinf(a) ? 1 : 0, a); + b = crt_copysigntf(crt_isinf(b) ? 1 : 0, b); if (crt_isnan(c)) - c = crt_copysignl(0, c); + c = crt_copysigntf(0, c); if (crt_isnan(d)) - d = crt_copysignl(0, d); + d = crt_copysigntf(0, d); recalc = 1; } if (crt_isinf(c) || crt_isinf(d)) { - c = crt_copysignl(crt_isinf(c) ? 1 : 0, c); - d = crt_copysignl(crt_isinf(d) ? 1 : 0, d); + c = crt_copysigntf(crt_isinf(c) ? 1 : 0, c); + d = crt_copysigntf(crt_isinf(d) ? 1 : 0, d); if (crt_isnan(a)) - a = crt_copysignl(0, a); + a = crt_copysigntf(0, a); if (crt_isnan(b)) - b = crt_copysignl(0, b); + b = crt_copysigntf(0, b); recalc = 1; } if (!recalc && (crt_isinf(ac) || crt_isinf(bd) || crt_isinf(ad) || crt_isinf(bc))) { if (crt_isnan(a)) - a = crt_copysignl(0, a); + a = crt_copysigntf(0, a); if (crt_isnan(b)) - b = crt_copysignl(0, b); + b = crt_copysigntf(0, b); if (crt_isnan(c)) - c = crt_copysignl(0, c); + c = crt_copysigntf(0, c); if (crt_isnan(d)) - d = crt_copysignl(0, d); + d = crt_copysigntf(0, d); recalc = 1; } if (recalc) { - __real__ z = CRT_INFINITY * (a * c - b * d); - __imag__ z = CRT_INFINITY * (a * d + b * c); + COMPLEXTF_REAL(z) = CRT_INFINITY * (a * c - b * d); + COMPLEXTF_IMAGINARY(z) = CRT_INFINITY * (a * d + b * c); } } return z; } + +#endif diff --git a/compiler-rt/lib/builtins/multf3.c b/compiler-rt/lib/builtins/multf3.c index 0626fb8..8fd7368 100644 --- a/compiler-rt/lib/builtins/multf3.c +++ b/compiler-rt/lib/builtins/multf3.c @@ -14,7 +14,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #include "fp_mul_impl.inc" COMPILER_RT_ABI fp_t __multf3(fp_t a, fp_t b) { return __mulXf3__(a, b); } diff --git a/compiler-rt/lib/builtins/mulvdi3.c b/compiler-rt/lib/builtins/mulvdi3.c index 1d672c6..d787d29 100644 --- a/compiler-rt/lib/builtins/mulvdi3.c +++ b/compiler-rt/lib/builtins/mulvdi3.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t di_int +#define fixuint_t du_int #include "int_mulv_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulvsi3.c b/compiler-rt/lib/builtins/mulvsi3.c index 00b2e50..2571881 100644 --- a/compiler-rt/lib/builtins/mulvsi3.c +++ b/compiler-rt/lib/builtins/mulvsi3.c @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #define fixint_t si_int +#define fixuint_t su_int #include "int_mulv_impl.inc" // Returns: a * b diff --git a/compiler-rt/lib/builtins/mulvti3.c b/compiler-rt/lib/builtins/mulvti3.c index ba35514..fad9b2a 100644 --- a/compiler-rt/lib/builtins/mulvti3.c +++ b/compiler-rt/lib/builtins/mulvti3.c @@ -19,6 +19,7 @@ // Effects: aborts if a * b overflows #define fixint_t ti_int +#define fixuint_t tu_int #include "int_mulv_impl.inc" COMPILER_RT_ABI ti_int 
__mulvti3(ti_int a, ti_int b) { return __mulvXi3(a, b); } diff --git a/compiler-rt/lib/builtins/mulxc3.c b/compiler-rt/lib/builtins/mulxc3.c index 2f7f14c..66b5b58 100644 --- a/compiler-rt/lib/builtins/mulxc3.c +++ b/compiler-rt/lib/builtins/mulxc3.c @@ -17,12 +17,12 @@ // Returns: the product of a + ib and c + id -COMPILER_RT_ABI Lcomplex __mulxc3(long double __a, long double __b, - long double __c, long double __d) { - long double __ac = __a * __c; - long double __bd = __b * __d; - long double __ad = __a * __d; - long double __bc = __b * __c; +COMPILER_RT_ABI Lcomplex __mulxc3(xf_float __a, xf_float __b, xf_float __c, + xf_float __d) { + xf_float __ac = __a * __c; + xf_float __bd = __b * __d; + xf_float __ad = __a * __d; + xf_float __bc = __b * __c; Lcomplex z; COMPLEX_REAL(z) = __ac - __bd; COMPLEX_IMAGINARY(z) = __ad + __bc; diff --git a/compiler-rt/lib/builtins/negdi2.c b/compiler-rt/lib/builtins/negdi2.c index 5a525d4..714ac8c 100644 --- a/compiler-rt/lib/builtins/negdi2.c +++ b/compiler-rt/lib/builtins/negdi2.c @@ -17,5 +17,5 @@ COMPILER_RT_ABI di_int __negdi2(di_int a) { // Note: this routine is here for API compatibility; any sane compiler // should expand it inline. - return -a; + return -(du_int)a; } diff --git a/compiler-rt/lib/builtins/negti2.c b/compiler-rt/lib/builtins/negti2.c index d52ba4e..ab6e09d 100644 --- a/compiler-rt/lib/builtins/negti2.c +++ b/compiler-rt/lib/builtins/negti2.c @@ -19,7 +19,7 @@ COMPILER_RT_ABI ti_int __negti2(ti_int a) { // Note: this routine is here for API compatibility; any sane compiler // should expand it inline. - return -a; + return -(tu_int)a; } #endif // CRT_HAS_128BIT diff --git a/compiler-rt/lib/builtins/negvdi2.c b/compiler-rt/lib/builtins/negvdi2.c index 5c52b3e..8c1cf2f 100644 --- a/compiler-rt/lib/builtins/negvdi2.c +++ b/compiler-rt/lib/builtins/negvdi2.c @@ -17,7 +17,8 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI di_int __negvdi2(di_int a) { - const di_int MIN = (di_int)1 << ((int)(sizeof(di_int) * CHAR_BIT) - 1); + const di_int MIN = + (di_int)((du_int)1 << ((int)(sizeof(di_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/negvsi2.c b/compiler-rt/lib/builtins/negvsi2.c index cccdee6..70f214f 100644 --- a/compiler-rt/lib/builtins/negvsi2.c +++ b/compiler-rt/lib/builtins/negvsi2.c @@ -17,7 +17,8 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI si_int __negvsi2(si_int a) { - const si_int MIN = (si_int)1 << ((int)(sizeof(si_int) * CHAR_BIT) - 1); + const si_int MIN = + (si_int)((su_int)1 << ((int)(sizeof(si_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/negvti2.c b/compiler-rt/lib/builtins/negvti2.c index 8f92e10..fc14840 100644 --- a/compiler-rt/lib/builtins/negvti2.c +++ b/compiler-rt/lib/builtins/negvti2.c @@ -19,7 +19,7 @@ // Effects: aborts if -a overflows COMPILER_RT_ABI ti_int __negvti2(ti_int a) { - const ti_int MIN = (ti_int)1 << ((int)(sizeof(ti_int) * CHAR_BIT) - 1); + const ti_int MIN = (ti_int)((tu_int)1 << ((int)(sizeof(ti_int) * CHAR_BIT) - 1)); if (a == MIN) compilerrt_abort(); return -a; diff --git a/compiler-rt/lib/builtins/os_version_check.c b/compiler-rt/lib/builtins/os_version_check.c index ebfb2df..b10f23a 100644 --- a/compiler-rt/lib/builtins/os_version_check.c +++ b/compiler-rt/lib/builtins/os_version_check.c @@ -14,6 +14,7 @@ #ifdef __APPLE__ #include <TargetConditionals.h> +#include <assert.h> #include <dispatch/dispatch.h> #include <dlfcn.h> #include <stdint.h> @@ -86,6 +87,10 @@ typedef Boolean (*CFStringGetCStringFuncTy)(CFStringRef, char *, CFIndex,
CFStringEncoding); typedef void (*CFReleaseFuncTy)(CFTypeRef); +extern __attribute__((weak_import)) +bool _availability_version_check(uint32_t count, + dyld_build_version_t versions[]); + static void _initializeAvailabilityCheck(bool LoadPlist) { if (AvailabilityVersionCheck && !LoadPlist) { // New API is supported and we're not being asked to load the plist, @@ -94,8 +99,8 @@ static void _initializeAvailabilityCheck(bool LoadPlist) { } // Use the new API if it's available. - AvailabilityVersionCheck = (AvailabilityVersionCheckFuncTy)dlsym( - RTLD_DEFAULT, "_availability_version_check"); + if (_availability_version_check) + AvailabilityVersionCheck = &_availability_version_check; if (AvailabilityVersionCheck && !LoadPlist) { // New API is supported and we're not being asked to load the plist, @@ -266,6 +271,8 @@ static inline uint32_t ConstructVersion(uint32_t Major, uint32_t Minor, return ((Major & 0xffff) << 16) | ((Minor & 0xff) << 8) | (Subminor & 0xff); } +#define PLATFORM_MACOS 1 + int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor) { dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck); @@ -278,6 +285,29 @@ int32_t __isPlatformVersionAtLeast(uint32_t Platform, uint32_t Major, return AvailabilityVersionCheck(1, Versions); } +#if TARGET_OS_OSX + +int32_t __isPlatformOrVariantPlatformVersionAtLeast( + uint32_t Platform, uint32_t Major, uint32_t Minor, uint32_t Subminor, + uint32_t Platform2, uint32_t Major2, uint32_t Minor2, uint32_t Subminor2) { + dispatch_once_f(&DispatchOnceCounter, NULL, initializeAvailabilityCheck); + + if (!AvailabilityVersionCheck) { + // Handle case of back-deployment for older macOS. + if (Platform == PLATFORM_MACOS) { + return __isOSVersionAtLeast(Major, Minor, Subminor); + } + assert(Platform2 == PLATFORM_MACOS && "unexpected platform"); + return __isOSVersionAtLeast(Major2, Minor2, Subminor2); + } + dyld_build_version_t Versions[] = { + {Platform, ConstructVersion(Major, Minor, Subminor)}, + {Platform2, ConstructVersion(Major2, Minor2, Subminor2)}}; + return AvailabilityVersionCheck(2, Versions); +} + +#endif + #elif __ANDROID__ #include <pthread.h> @@ -312,8 +342,8 @@ int32_t __isOSVersionAtLeast(int32_t Major, int32_t Minor, int32_t Subminor) { static pthread_once_t once = PTHREAD_ONCE_INIT; pthread_once(&once, readSystemProperties); - return SdkVersion >= Major || - (IsPreRelease && Major == __ANDROID_API_FUTURE__); + // Allow all on pre-release. Note that we still rely on compile-time checks.
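The simplified `return` that follows drops the `__ANDROID_API_FUTURE__` comparison: on a pre-release system, any requested major version now passes. For context, `SdkVersion` and `IsPreRelease` are filled in from system properties, roughly as sketched here (modelled on what `readSystemProperties()` in this file does; details may differ):

```c
#include <stdlib.h>
#include <string.h>
#include <sys/system_properties.h>

static int SdkVersion;
static int IsPreRelease;

static void readSystemProperties(void) {
  char buf[PROP_VALUE_MAX];
  if (__system_property_get("ro.build.version.sdk", buf) > 0)
    SdkVersion = atoi(buf);
  // Pre-release builds report a codename instead of "REL".
  if (__system_property_get("ro.build.version.codename", buf) > 0)
    IsPreRelease = strcmp(buf, "REL") != 0;
}
```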
+ return SdkVersion >= Major || IsPreRelease; } #else diff --git a/compiler-rt/lib/builtins/powitf2.c b/compiler-rt/lib/builtins/powitf2.c index 8e639a0..e02db40 100644 --- a/compiler-rt/lib/builtins/powitf2.c +++ b/compiler-rt/lib/builtins/powitf2.c @@ -13,13 +13,13 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) // Returns: a ^ b -COMPILER_RT_ABI long double __powitf2(long double a, int b) { +COMPILER_RT_ABI fp_t __powitf2(fp_t a, int b) { const int recip = b < 0; - long double r = 1; + fp_t r = 1; while (1) { if (b & 1) r *= a; diff --git a/compiler-rt/lib/builtins/powixf2.c b/compiler-rt/lib/builtins/powixf2.c index 3edfe9f..ab8c694 100644 --- a/compiler-rt/lib/builtins/powixf2.c +++ b/compiler-rt/lib/builtins/powixf2.c @@ -16,9 +16,9 @@ // Returns: a ^ b -COMPILER_RT_ABI long double __powixf2(long double a, int b) { +COMPILER_RT_ABI xf_float __powixf2(xf_float a, int b) { const int recip = b < 0; - long double r = 1; + xf_float r = 1; while (1) { if (b & 1) r *= a; diff --git a/compiler-rt/lib/builtins/riscv/fp_mode.c b/compiler-rt/lib/builtins/riscv/fp_mode.c index c542c34..1a5a3de 100644 --- a/compiler-rt/lib/builtins/riscv/fp_mode.c +++ b/compiler-rt/lib/builtins/riscv/fp_mode.c @@ -15,7 +15,7 @@ #define RISCV_INEXACT 0x1 CRT_FE_ROUND_MODE __fe_getround(void) { -#if defined(__riscv_f) +#if defined(__riscv_f) || defined(__riscv_zfinx) int frm; __asm__ __volatile__("frrm %0" : "=r" (frm)); switch (frm) { @@ -35,7 +35,7 @@ CRT_FE_ROUND_MODE __fe_getround(void) { } int __fe_raise_inexact(void) { -#if defined(__riscv_f) +#if defined(__riscv_f) || defined(__riscv_zfinx) __asm__ __volatile__("csrsi fflags, %0" :: "i" (RISCV_INEXACT)); #endif return 0; diff --git a/compiler-rt/lib/builtins/riscv/restore.S b/compiler-rt/lib/builtins/riscv/restore.S index 73f64a9..d87dfc1 100644 --- a/compiler-rt/lib/builtins/riscv/restore.S +++ b/compiler-rt/lib/builtins/riscv/restore.S @@ -22,6 +22,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_abi_rve + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -86,8 +88,29 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + lw s1, 0(sp) + lw s0, 4(sp) + lw ra, 8(sp) + addi sp, sp, 12 + ret + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_abi_rve + .globl __riscv_restore_12 .type __riscv_restore_12,@function __riscv_restore_12: @@ -161,6 +184,25 @@ __riscv_restore_0: addi sp, sp, 16 ret +#else + + .globl __riscv_restore_2 + .type __riscv_restore_2,@function + .globl __riscv_restore_1 + .type __riscv_restore_1,@function + .globl __riscv_restore_0 + .type __riscv_restore_0,@function +__riscv_restore_2: +__riscv_restore_1: +__riscv_restore_0: + ld s1, 0(sp) + ld s0, 8(sp) + ld ra, 16(sp) + addi sp, sp, 24 + ret + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/riscv/save.S b/compiler-rt/lib/builtins/riscv/save.S index 85501ae..6324e05 100644 --- a/compiler-rt/lib/builtins/riscv/save.S +++ b/compiler-rt/lib/builtins/riscv/save.S @@ -18,6 +18,8 @@ #if __riscv_xlen == 32 +#ifndef __riscv_abi_rve + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -92,8 +94,29 @@ __riscv_save_0: sw ra, 12(sp) jr t0 +#else + 
+ .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -12 + sw s1, 0(sp) + sw s0, 4(sp) + sw ra, 8(sp) + jr t0 + +#endif + #elif __riscv_xlen == 64 +#ifndef __riscv_abi_rve + .globl __riscv_save_12 .type __riscv_save_12,@function __riscv_save_12: @@ -181,6 +204,25 @@ __riscv_save_0: sd ra, 8(sp) jr t0 +#else + + .globl __riscv_save_2 + .type __riscv_save_2,@function + .globl __riscv_save_1 + .type __riscv_save_1,@function + .globl __riscv_save_0 + .type __riscv_save_0,@function +__riscv_save_2: +__riscv_save_1: +__riscv_save_0: + addi sp, sp, -24 + sd s1, 0(sp) + sd s0, 8(sp) + sd ra, 16(sp) + jr t0 + +#endif + #else # error "xlen must be 32 or 64 for save-restore implementation #endif diff --git a/compiler-rt/lib/builtins/subtf3.c b/compiler-rt/lib/builtins/subtf3.c index 3364c28..e1b1022 100644 --- a/compiler-rt/lib/builtins/subtf3.c +++ b/compiler-rt/lib/builtins/subtf3.c @@ -13,7 +13,7 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) COMPILER_RT_ABI fp_t __addtf3(fp_t a, fp_t b); // Subtraction; flip the sign bit of b and add. diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c index 844eb27..830e25e 100644 --- a/compiler-rt/lib/builtins/trampoline_setup.c +++ b/compiler-rt/lib/builtins/trampoline_setup.c @@ -41,3 +41,45 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, __clear_cache(trampOnStack, &trampOnStack[10]); } #endif // __powerpc__ && !defined(__powerpc64__) + +// The AArch64 compiler generates calls to __trampoline_setup() when creating +// trampoline functions on the stack for use with nested functions. +// This function creates a custom 36-byte trampoline function on the stack +// which loads x18 with a pointer to the outer function's locals +// and then jumps to the target nested function. +// Note: x18 is a reserved platform register on Windows and macOS. + +#if defined(__aarch64__) && defined(__ELF__) +COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, + int trampSizeAllocated, + const void *realFunc, void *localsPtr) { + // This should never happen, but if compiler did not allocate + // enough space on stack for the trampoline, abort. + if (trampSizeAllocated < 36) + compilerrt_abort(); + + // create trampoline + // Load realFunc into x17. mov/movk 16 bits at a time. + trampOnStack[0] = + 0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11; + trampOnStack[1] = + 0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11; + trampOnStack[2] = + 0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11; + trampOnStack[3] = + 0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11; + // Load localsPtr into x18 + trampOnStack[4] = + 0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12; + trampOnStack[5] = + 0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12; + trampOnStack[6] = + 0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12; + trampOnStack[7] = + 0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12; + trampOnStack[8] = 0xd61f0220; // br x17 + + // Clear instruction cache. 
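Before the final `__clear_cache` call below: the constants stored into `trampOnStack` are A64 `movz`/`movk`/`br` encodings targeting x17 (`0x11`) and x18 (`0x12`). Hypothetical helpers that make the encoding explicit (the patch itself writes the constants inline):

```c
#include <stdint.h>

// movz xd, #imm16               -> 0xd2800000 | imm16 << 5 | d
static uint32_t movz(unsigned d, uint16_t imm16) {
  return 0xd2800000u | ((uint32_t)imm16 << 5) | d;
}

// movk xd, #imm16, lsl #(hw*16) -> 0xf2800000 | hw << 21 | imm16 << 5 | d
// hw = 1/2/3 yields the 0xf2a0/0xf2c0/0xf2e0 prefixes seen above.
static uint32_t movk(unsigned d, uint16_t imm16, unsigned hw) {
  return 0xf2800000u | ((uint32_t)hw << 21) | ((uint32_t)imm16 << 5) | d;
}

// br x17 -> 0xd61f0000 | 17 << 5 == 0xd61f0220, the last word stored.
```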
+ __clear_cache(trampOnStack, &trampOnStack[9]); +} +#endif // defined(__aarch64__) && defined(__ELF__) diff --git a/compiler-rt/lib/builtins/trunctfdf2.c b/compiler-rt/lib/builtins/trunctfdf2.c index 6857ea5..a5bdded 100644 --- a/compiler-rt/lib/builtins/trunctfdf2.c +++ b/compiler-rt/lib/builtins/trunctfdf2.c @@ -9,11 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_QUAD #define DST_DOUBLE #include "fp_trunc_impl.inc" -COMPILER_RT_ABI double __trunctfdf2(long double a) { return __truncXfYf2__(a); } +COMPILER_RT_ABI dst_t __trunctfdf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfhf2.c b/compiler-rt/lib/builtins/trunctfhf2.c index e3a2309..3f031e0 100644 --- a/compiler-rt/lib/builtins/trunctfhf2.c +++ b/compiler-rt/lib/builtins/trunctfhf2.c @@ -10,14 +10,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) && \ - defined(COMPILER_RT_HAS_FLOAT16) +#if defined(CRT_HAS_TF_MODE) && defined(COMPILER_RT_HAS_FLOAT16) #define SRC_QUAD #define DST_HALF #include "fp_trunc_impl.inc" -COMPILER_RT_ABI _Float16 __trunctfhf2(long double a) { - return __truncXfYf2__(a); -} +COMPILER_RT_ABI dst_t __trunctfhf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfsf2.c b/compiler-rt/lib/builtins/trunctfsf2.c index 0261b1e..b65b5af 100644 --- a/compiler-rt/lib/builtins/trunctfsf2.c +++ b/compiler-rt/lib/builtins/trunctfsf2.c @@ -9,11 +9,11 @@ #define QUAD_PRECISION #include "fp_lib.h" -#if defined(CRT_HAS_128BIT) && defined(CRT_LDBL_128BIT) +#if defined(CRT_HAS_TF_MODE) #define SRC_QUAD #define DST_SINGLE #include "fp_trunc_impl.inc" -COMPILER_RT_ABI float __trunctfsf2(long double a) { return __truncXfYf2__(a); } +COMPILER_RT_ABI dst_t __trunctfsf2(src_t a) { return __truncXfYf2__(a); } #endif diff --git a/compiler-rt/lib/builtins/trunctfxf2.c b/compiler-rt/lib/builtins/trunctfxf2.c new file mode 100644 index 0000000..49bd32d --- /dev/null +++ b/compiler-rt/lib/builtins/trunctfxf2.c @@ -0,0 +1,23 @@ +//===-- lib/trunctfxf2.c - quad -> 80-bit extended conversion -----*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Assumption: long double is an IEEE 80-bit floating point type padded to 128 +// bits. + +#define QUAD_PRECISION +#include "fp_lib.h" + +#if defined(CRT_HAS_TF_MODE) && __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) + +#define SRC_QUAD +#define DST_80 +#include "fp_trunc_impl.inc" + +COMPILER_RT_ABI xf_float __trunctfxf2(tf_float a) { return __truncXfYf2__(a); } + +#endif diff --git a/compiler-rt/lib/builtins/x86_64/chkstk2.S b/compiler-rt/lib/builtins/x86_64/chkstk2.S deleted file mode 100644 index 33d10d5..0000000 --- a/compiler-rt/lib/builtins/x86_64/chkstk2.S +++ /dev/null @@ -1,43 +0,0 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "../assembly.h" - -#ifdef __x86_64__ - -// _chkstk (_alloca) routine - probe stack between %rsp and (%rsp-%rax) in 4k increments, -// then decrement %rsp by %rax.
Preserves all registers except %rsp and flags. -// This routine is windows specific -// http://msdn.microsoft.com/en-us/library/ms648426.aspx - -.text -.balign 4 -DEFINE_COMPILERRT_FUNCTION(__alloca) - mov %rcx,%rax // x64 _alloca is a normal function with parameter in rcx - // fallthrough -DEFINE_COMPILERRT_FUNCTION(___chkstk) - push %rcx - cmp $0x1000,%rax - lea 16(%rsp),%rcx // rsp before calling this routine -> rcx - jb 1f -2: - sub $0x1000,%rcx - test %rcx,(%rcx) - sub $0x1000,%rax - cmp $0x1000,%rax - ja 2b -1: - sub %rax,%rcx - test %rcx,(%rcx) - - lea 8(%rsp),%rax // load pointer to the return address into rax - mov %rcx,%rsp // install the new top of stack pointer into rsp - mov -8(%rax),%rcx // restore rcx - push (%rax) // push return address onto the stack - sub %rsp,%rax // restore the original value in rax - ret -END_COMPILERRT_FUNCTION(___chkstk) -END_COMPILERRT_FUNCTION(__alloca) - -#endif // __x86_64__ diff --git a/compiler-rt/lib/builtins/x86_64/floatdixf.c b/compiler-rt/lib/builtins/x86_64/floatdixf.c index cf8450c..54636e2 100644 --- a/compiler-rt/lib/builtins/x86_64/floatdixf.c +++ b/compiler-rt/lib/builtins/x86_64/floatdixf.c @@ -2,12 +2,12 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// long double __floatdixf(di_int a); +// xf_float __floatdixf(di_int a); #ifdef __x86_64__ #include "../int_lib.h" -long double __floatdixf(int64_t a) { return (long double)a; } +xf_float __floatdixf(int64_t a) { return (xf_float)a; } #endif // __i386__ diff --git a/compiler-rt/lib/builtins/x86_64/floatundixf.S b/compiler-rt/lib/builtins/x86_64/floatundixf.S index 9e3bced..cf7286f 100644 --- a/compiler-rt/lib/builtins/x86_64/floatundixf.S +++ b/compiler-rt/lib/builtins/x86_64/floatundixf.S @@ -4,7 +4,7 @@ #include "../assembly.h" -// long double __floatundixf(du_int a); +// xf_float __floatundixf(du_int a); #ifdef __x86_64__