OpenNMT · 3manifold · Jan 29, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -12,6 +12,7 @@ option(WITH_DNNL "Compile with DNNL backend" OFF)
 option(WITH_ACCELERATE "Compile with Accelerate backend" OFF)
 option(WITH_OPENBLAS "Compile with OpenBLAS backend" OFF)
 option(WITH_RUY "Compile with Ruy backend" OFF)
+option(WITH_CANN "Compile with CANN backend" OFF)
 option(WITH_CUDA "Compile with CUDA backend" OFF)
 option(WITH_CUDNN "Compile with cuDNN backend" OFF)
 option(CUDA_DYNAMIC_LOADING "Dynamically load CUDA libraries at runtime" OFF)
@@ -21,6 +22,12 @@ option(BUILD_CLI "Compile the clients" ON)
 option(BUILD_TESTS "Compile the tests" OFF)
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 
+# The CANN backend is mutually exclusive with the CUDA backends: abort the
+# configuration when it is requested together with CUDA or cuDNN.
+if(WITH_CANN AND (WITH_CUDA OR WITH_CUDNN))
+  message(FATAL_ERROR "CANN backend cannot be combined with CUDA or CUDNN!")
+endif()
+
 if(ENABLE_PROFILING)
   message(STATUS "Enable profiling support")
   add_definitions(-DCT2_ENABLE_PROFILING)
@@ -525,6 +532,105 @@ if (WITH_CUDA)
     )
 elseif(WITH_CUDNN)
   message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
+elseif(WITH_CANN)
+  # Ascend NPU backend: import the CANN toolkit libraries and add the NPU sources.
+  add_definitions(-DCT2_WITH_CANN)
+
+  # Log the Ascend-related environment to help diagnose toolkit setup issues.
+  message(STATUS "ASCEND_TOOLKIT_HOME: $ENV{ASCEND_TOOLKIT_HOME}")
+  message(STATUS "LD_LIBRARY_PATH: $ENV{LD_LIBRARY_PATH}")
+  message(STATUS "PYTHONPATH: $ENV{PYTHONPATH}")
+  message(STATUS "ASCEND_AICPU_PATH: $ENV{ASCEND_AICPU_PATH}")
+  message(STATUS "ASCEND_OPP_PATH: $ENV{ASCEND_OPP_PATH}")
+  message(STATUS "TOOLCHAIN_HOME: $ENV{TOOLCHAIN_HOME}")
+  message(STATUS "ASCEND_HOME_PATH: $ENV{ASCEND_HOME_PATH}")
+  message(STATUS "PATH: $ENV{PATH}")
+
+  # Installation root: ASCEND_CUSTOM_PATH from the environment, else the default.
+  if(DEFINED ENV{ASCEND_CUSTOM_PATH})
+    set(ASCEND_DIR "$ENV{ASCEND_CUSTOM_PATH}")
+  else()
+    set(ASCEND_DIR /usr/local/Ascend)
+  endif()
+  message(STATUS "ASCEND_DIR: ${ASCEND_DIR}")
+
+  # Layout below ASCEND_DIR; not all of these are read here, kept in case other parts of the build use them.
+  set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
+  set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
+  set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
+  set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+  set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
+  set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+  set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
+  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
+  include_directories(${ATLAS_RUNTIME_INC_DIR} ${FWKACLLIB_INC_DIR} ${ACLLIB_INC_DIR})
+
+  # Shared libraries provided by the toolkit.
+  set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
+  set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
+  set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
+  set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
+  set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
+  set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
+
+  # Fail at configure time, with a clear message, when a library that gets linked is missing.
+  foreach(cann_lib IN ITEMS "${atlas_ge_runner_lib}" "${atlas_graph_lib}"
+          "${ascendcl_lib}" "${acl_op_compiler_lib}")
+    if(NOT EXISTS "${cann_lib}")
+      message(FATAL_ERROR "CANN library not found: ${cann_lib}. Set ASCEND_CUSTOM_PATH to the Ascend installation root.")
+    endif()
+  endforeach()
+
+  # Imported targets for the toolkit libraries (ascend_hccl is declared but not added to the link lists).
+  add_library(ascend_ge SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION "${atlas_ge_runner_lib}")
+  add_library(ascend_graph SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION "${atlas_graph_lib}")
+  add_library(atlas_acl SHARED IMPORTED GLOBAL)
+  set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION "${atlas_acl_lib}")
+  add_library(ascendcl SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION "${ascendcl_lib}")
+  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
+  set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION "${ascend_hccl_lib}")
+  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
+  set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION "${acl_op_compiler_lib}")
+  set(extern_ascend ascend_ge ascend_graph atlas_acl CACHE INTERNAL "acllib runtime libs")
+  set(extern_ascend_cl ascendcl acl_op_compiler CACHE INTERNAL "acltoolkit libs")
+
+  list(APPEND SOURCES
+          src/cann/allocator.cc
+          src/cann/primitives.cc
+          src/cann/utils.cc
+          src/ops/topk_npu.cc
+          src/ops/dequantize_npu.cc
+          src/ops/gumbel_max_npu.cc
+          src/ops/topp_mask_npu.cc
+          src/ops/multinomial_npu.cc
+          src/ops/gather_npu.cc
+          src/ops/conv1d_npu.cc
+          src/ops/concat_split_slide_npu.cc
+          src/ops/alibi_add_npu.cc
+          src/ops/softmax_npu.cc
+          src/ops/tile_npu.cc
+          src/ops/rms_norm_npu.cc
+          src/ops/layer_norm_npu.cc
+          src/ops/rotary_npu.cc
+          src/ops/bias_add_npu.cc
+          src/ops/mean_npu.cc
+          src/ops/quantize_npu.cc)
+  add_library(${PROJECT_NAME} ${SOURCES})
+  list(APPEND LIBRARIES ${extern_ascend} ${extern_ascend_cl})
 else()
   add_library(${PROJECT_NAME} ${SOURCES})
 endif()
@@ -540,6 +646,7 @@ set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY
 )
 
 list(APPEND LIBRARIES ${CMAKE_DL_LIBS})
+
 target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBRARIES})
 target_include_directories(${PROJECT_NAME} BEFORE
   PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>

diff --git a/README.md b/README.md
@@ -25,12 +25,12 @@ The project is production-oriented and comes with [backward compatibility guaran
 
 ## Key features
 
-* **Fast and efficient execution on CPU and GPU**<br/>The execution [is significantly faster and requires less resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
+* **Fast and efficient execution on CPU, GPU and NPU**<br/>The execution [is significantly faster and requires less resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
 * **Quantization and reduced precision**<br/>The model serialization and computation support weights with [reduced precision](https://opennmt.net/CTranslate2/quantization.html): 16-bit floating points (FP16), 16-bit brain floating points (BF16), 16-bit integers (INT16), and 8-bit integers (INT8).
 * **Multiple CPU architectures support**<br/>The project supports x86-64 and AArch64/ARM64 processors and integrates multiple backends that are optimized for these platforms: [Intel MKL](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html), [oneDNN](https://github.com/oneapi-src/oneDNN), [OpenBLAS](https://www.openblas.net/), [Ruy](https://github.com/google/ruy), and [Apple Accelerate](https://developer.apple.com/documentation/accelerate).
 * **Automatic CPU detection and code dispatch**<br/>One binary can include multiple backends (e.g. Intel MKL and oneDNN) and instruction set architectures (e.g. AVX, AVX2) that are automatically selected at runtime based on the CPU information.
-* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs or CPU cores.
-* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on both CPU and GPU.
+* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs, NPUs or CPU cores.
+* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on CPU, GPU and NPU.
 * **Lightweight on disk**<br/>Quantization can make the models 4 times smaller on disk with minimal accuracy loss.
 * **Simple integration**<br/>The project has few dependencies and exposes simple APIs in [Python](https://opennmt.net/CTranslate2/python/overview.html) and C++ to cover most integration needs.
 * **Configurable and interactive decoding**<br/>[Advanced decoding features](https://opennmt.net/CTranslate2/decoding.html) allow autocompleting a partial sequence and returning alternatives at a specific location in the sequence.

diff --git a/cli/translator.cc b/cli/translator.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]) {
      cxxopts::value<size_t>()->default_value("1"))
     ("intra_threads", "Number of computation threads (set to 0 to use the default value).",
      cxxopts::value<size_t>()->default_value("0"))
-    ("device", "Device to use (can be cpu, cuda, auto).",
+    ("device", "Device to use (can be cpu, cuda, cann, auto).",
      cxxopts::value<std::string>()->default_value("cpu"))
     ("device_index", "Comma-separated list of device IDs to use.",
      cxxopts::value<std::vector<int>>()->default_value("0"))
@@ -44,6 +44,8 @@ int main(int argc, char* argv[]) {
      cxxopts::value<std::string>()->default_value("default"))
     ("cuda_compute_type", "Computation type on CUDA devices (overrides compute_type)",
      cxxopts::value<std::string>())
+    ("cann_compute_type", "Computation type on CANN devices (overrides compute_type)",
+     cxxopts::value<std::string>())
     ("cpu_compute_type", "Computation type on CPU devices (overrides compute_type)",
      cxxopts::value<std::string>())
     ;
@@ -139,6 +141,10 @@ int main(int argc, char* argv[]) {
     if (args.count("cuda_compute_type"))
       compute_type = ctranslate2::str_to_compute_type(args["cuda_compute_type"].as<std::string>());
     break;
+  case ctranslate2::Device::CANN:
+    if (args.count("cann_compute_type"))
+      compute_type = ctranslate2::str_to_compute_type(args["cann_compute_type"].as<std::string>());
+    break;
   };
 
   ctranslate2::ReplicaPoolConfig pool_config;

diff --git a/docker/cann/Dockerfile_cann b/docker/cann/Dockerfile_cann
@@ -0,0 +1,78 @@
+# Extend/build an image for CANN support
+# Ascend-cann-toolkit_<VERSION>.run is expected to exist in <project_root>/ascend_install_files
+
+# preferably arm64
+FROM ubuntu:20.04
+
+# System packages, installed in a single layer. DEBIAN_FRONTEND is exported so
+# that it applies to every command of the chain (a "VAR=value cmd1 && cmd2"
+# prefix only affects cmd1), and the apt caches are removed in the same layer
+# so that they are not stored in the image.
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        net-tools \
+        libsqlite3-dev \
+        zlib1g \
+        openssl \
+        ca-certificates \
+        bc wget \
+        curl gdb cmake gcc make g++ pkg-config unzip \
+        libblas3 liblapack3 gfortran vim \
+        liblapack-dev libblas-dev libhdf5-dev libffi-dev \
+        libssl-dev zlib1g-dev xz-utils cython3 python3-h5py \
+        libopenblas-dev libgmpxx4ldbl liblzma-dev \
+        pciutils \
+        python3-dev python3-pip && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Python packages, installed in a single layer. --no-cache-dir keeps pip from
+# writing its download cache; the cache directory is removed as well in case
+# anything was left there.
+RUN python3 -m pip --no-cache-dir install \
+        numpy \
+        decorator \
+        sympy \
+        cffi \
+        pyyaml \
+        pathlib2 \
+        protobuf \
+        scipy \
+        psutil \
+        requests \
+        absl-py \
+        attrs && \
+    rm -rf /root/.cache/pip
+
+# Install the Ascend toolkit; the installer (version hard-coded below) is copied from ascend_install_files in the build context
+COPY ascend_install_files  ascend_install_files
+RUN chmod +x ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run && \
+  ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run --install && \
+  rm -f ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run   
+
+# Add the HwHiAiUser group and a user of the same name with home directory /home/HwHiAiUser
+RUN groupadd HwHiAiUser &&     useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser  
+
+# This is copied from /usr/local/Ascend/ascend-toolkit/set_env.sh of the respective ascend-toolkit version
+ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:$LD_LIBRARY_PATH
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:$PYTHONPATH
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:$PATH
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH
+# ENV HCCL_CONNECT_TIMEOUT=7200
+# ENV HCCL_WHITELIST_DISABLE=1
+# ENV HCCL_SECURITY_MODE=1 
+
+ENV ASCEND_GLOBAL_LOG_LEVEL=3
+
+# Copy the toolkit's set_env.sh to /etc/profile.d so the variables are set again for interactive ssh sessions (toolkit must already be installed)
+RUN cp /usr/local/Ascend/ascend-toolkit/set_env.sh /etc/profile.d/
+RUN chmod 644 /etc/profile.d/set_env.sh
diff --git a/docker/cann/run_container_cann.sh b/docker/cann/run_container_cann.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# build image that will host CANN environment
+cd ../../
+docker build -t ctranslate2-aarch64 -f docker/cann/Dockerfile_cann --platform linux/arm64 .
+
+# run the respective container
+docker run  \
+-d --cap-add sys_ptrace \
+--pids-limit 409600 \
+--privileged --shm-size=128G \
+-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/dcmi:/usr/local/dcmi \
+--name ctranslate2-aarch64 <container>
diff --git a/docs/hardware_support.md b/docs/hardware_support.md
@@ -20,3 +20,10 @@ See the [environment variables](environment_variables.md) `CT2_USE_MKL` and `CT2
 * NVIDIA GPUs with a Compute Capability greater or equal to 3.5
 
 The driver requirement depends on the CUDA version. See the [CUDA Compatibility guide](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) for more information.
+
+## NPU
+
+* AArch64/ARM64 processors
+* Ascend NPU AI Processor greater or equal to 910A
+
+`CANN` version greater or equal to `7.0.RC1.alpha001` (depends on NPU model). See [CANN documentation](https://support.huawei.com/enterprise/en/ascend-computing/cann-pid-251168373) for more information.
diff --git a/examples/cann/CMakeLists.txt b/examples/cann/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Standalone build of the CANN example executable, linked against ctranslate2.
+# CMAKE_CXX_STANDARD 17 requires CMake 3.8 or newer.
+cmake_minimum_required(VERSION 3.8)
+project(cann LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Default to a Release build, but only when no build type was chosen and the
+# generator is single-configuration.
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+endif()
+
+find_package(Threads REQUIRED)
+
+add_executable(cann_run main.cc)
+target_link_libraries(cann_run PRIVATE
+  Threads::Threads
+  ctranslate2
+  )
diff --git a/examples/cann/README.md b/examples/cann/README.md
@@ -0,0 +1,45 @@
+# CANN example query
+This example demonstrates a translation query employing `CANN` using the English-German Transformer model trained with OpenNMT-py as in [CTranslate2 documentation](https://opennmt.net/CTranslate2/quickstart.html).
+
+## Environment setup  
+- Create the environment: `docker/cann/Dockerfile_cann`
+- Run the container: `docker/cann/run_container_cann.sh`
+
+## Download model
+```bash
+wget https://s3.amazonaws.com/opennmt-models/transformer-ende-wmt-pyOnmt.tar.gz
+tar xf transformer-ende-wmt-pyOnmt.tar.gz
+```
+
+## Build executable
+Run `examples/cann/build_run.sh`
+
+### Expected output
+
+```
+current path: "<current path>"
+input data path: "<input data path>"
+[<timestamp>] [ctranslate2] [thread 49835] [info] CPU: ARM (NEON=true)
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Selected ISA: NEON
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Use Intel MKL: false
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - SGEMM backend: OpenBLAS (packed: false)
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - GEMM_S16 backend: none (packed: false)
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - GEMM_S8 backend: Ruy (packed: false, u8s8 preferred: false)
+[<timestamp>] [ctranslate2] [thread 49835] [info] NPU:
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Number of NPU cores: 8
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - aclrtRunMode: ACL_HOST
+[<timestamp>] [ctranslate2] [thread 49835] [info] Loaded model <path> on device cann:0
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Binary version: 6
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Model specification revision: 7
+[<timestamp>] [ctranslate2] [thread 49835] [info]  - Selected compute type: float32
+input data:
+▁H ello ▁world !
+Start: Warmup examples
+output:
+▁Hallo ▁Welt !
+input data:
+▁H ello ▁world !
+Start: Query examples
+output:
+▁Hallo ▁Welt !
+```
diff --git a/examples/cann/build_run.sh b/examples/cann/build_run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# execute from project root
+
+# first build ct2lib
+rm -rf build-release/
+mkdir build-release && cd build-release || exit
+
+cmake -DWITH_CANN=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_CLI=OFF -DWITH_MKL=OFF -DOPENMP_RUNTIME=COMP -DCMAKE_PREFIX_PATH="/opt/OpenBLAS" -DWITH_OPENBLAS=ON -DWITH_RUY=ON ..
+
+make -j"$(nproc)"
+
+rm CMakeCache.txt
+
+# then build cann_run
+cmake -DCMAKE_BUILD_TYPE=Release ../examples/cann/
+
+make -j"$(nproc)"
+# ./cann_run <ende_ctranslate2_path>