Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CANN Backend support #1606

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ option(WITH_DNNL "Compile with DNNL backend" OFF)
option(WITH_ACCELERATE "Compile with Accelerate backend" OFF)
option(WITH_OPENBLAS "Compile with OpenBLAS backend" OFF)
option(WITH_RUY "Compile with Ruy backend" OFF)
option(WITH_CANN "Compile with CANN backend" OFF)
option(WITH_CUDA "Compile with CUDA backend" OFF)
option(WITH_CUDNN "Compile with cuDNN backend" OFF)
option(CUDA_DYNAMIC_LOADING "Dynamically load CUDA libraries at runtime" OFF)
Expand All @@ -21,6 +22,12 @@ option(BUILD_CLI "Compile the clients" ON)
option(BUILD_TESTS "Compile the tests" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)

# Reject unsupported backend combinations up front: the CANN backend cannot
# be enabled together with CUDA or cuDNN.
if(WITH_CANN AND (WITH_CUDA OR WITH_CUDNN))
  message(FATAL_ERROR "CANN backend cannot be combined with CUDA or CUDNN!")
endif()

if(ENABLE_PROFILING)
message(STATUS "Enable profiling support")
add_definitions(-DCT2_ENABLE_PROFILING)
Expand Down Expand Up @@ -525,6 +532,105 @@ if (WITH_CUDA)
)
elseif(WITH_CUDNN)
message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
elseif(WITH_CANN)
  # CANN backend (Ascend NPU): add the NPU sources to the library and link it
  # against the shared libraries shipped with the Ascend toolkit.
  add_definitions(-DCT2_WITH_CANN)

  # Log the Ascend-related environment at configure time; this is purely
  # diagnostic output to help troubleshoot the toolkit setup.
  message(STATUS "ASCEND_TOOLKIT_HOME: $ENV{ASCEND_TOOLKIT_HOME}")
  message(STATUS "LD_LIBRARY_PATH: $ENV{LD_LIBRARY_PATH}")
  message(STATUS "PYTHONPATH: $ENV{PYTHONPATH}")
  message(STATUS "ASCEND_AICPU_PATH: $ENV{ASCEND_AICPU_PATH}")
  message(STATUS "ASCEND_OPP_PATH: $ENV{ASCEND_OPP_PATH}")
  message(STATUS "TOOLCHAIN_HOME: $ENV{TOOLCHAIN_HOME}")
  message(STATUS "ASCEND_HOME_PATH: $ENV{ASCEND_HOME_PATH}")
  message(STATUS "PATH: $ENV{PATH}")

  # Root of the Ascend installation. ASCEND_CUSTOM_PATH overrides the default
  # location; an empty value is ignored so that paths never resolve under "/".
  if(DEFINED ENV{ASCEND_CUSTOM_PATH} AND NOT "$ENV{ASCEND_CUSTOM_PATH}" STREQUAL "")
    set(ASCEND_DIR "$ENV{ASCEND_CUSTOM_PATH}")
  else()
    set(ASCEND_DIR /usr/local/Ascend)
  endif()

  message(STATUS "ASCEND_DIR: ${ASCEND_DIR}")

  # Driver / runtime directory layout below ASCEND_DIR.
  # NOTE(review): several of these variables are not referenced anywhere in
  # this branch; they are kept in case other parts of the build read them.
  set(ASCEND_DRIVER_DIR "${ASCEND_DIR}/driver/lib64")
  set(ASCEND_DRIVER_COMMON_DIR "${ASCEND_DIR}/driver/lib64/common")
  set(ASCEND_DRIVER_SHARE_DIR "${ASCEND_DIR}/driver/lib64/share")
  set(ASCEND_RUNTIME_DIR "${ASCEND_DIR}/fwkacllib/lib64")
  set(ASCEND_ATC_DIR "${ASCEND_DIR}/atc/lib64")
  set(ASCEND_ACL_DIR "${ASCEND_DIR}/acllib/lib64")
  set(STATIC_ACL_LIB "${ASCEND_ACL_DIR}")

  set(ASCEND_MS_RUNTIME_PATH "${ASCEND_RUNTIME_DIR}" "${ASCEND_ACL_DIR}" "${ASCEND_ATC_DIR}")
  set(ASCEND_MS_DRIVER_PATH "${ASCEND_DRIVER_DIR}" "${ASCEND_DRIVER_COMMON_DIR}")
  set(ATLAS_RUNTIME_DIR "${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64")
  set(ATLAS_RUNTIME_INC_DIR "${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include")
  set(ATLAS_ACL_DIR "${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64")
  set(ATLAS_ATC_DIR "${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64")
  set(ATLAS_MS_RUNTIME_PATH "${ATLAS_RUNTIME_DIR}" "${ATLAS_ACL_DIR}" "${ATLAS_ATC_DIR}")

  set(ASCEND_CL_DIR "${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64")
  set(FWKACLLIB_INC_DIR "${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include")
  set(ACLLIB_INC_DIR "${ASCEND_DIR}/ascend-toolkit/latest/acllib/include")

  # Full paths of the toolkit shared libraries wrapped as imported targets.
  set(atlas_graph_lib "${ATLAS_RUNTIME_DIR}/libgraph.so")
  set(atlas_ge_runner_lib "${ATLAS_RUNTIME_DIR}/libge_runner.so")
  set(atlas_acl_lib "${ATLAS_RUNTIME_DIR}/libascendcl.so")
  set(ascend_hccl_lib "${ASCEND_CL_DIR}/libhccl.so")
  set(ascendcl_lib "${ASCEND_CL_DIR}/libascendcl.so")
  set(acl_op_compiler_lib "${ASCEND_CL_DIR}/libacl_op_compiler.so")

  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")

  # Fail at configure time, with an actionable message, when a library that
  # is linked below is missing. libhccl.so is not checked because its target
  # is declared but never added to LIBRARIES.
  foreach(cann_required_lib IN ITEMS
      "${atlas_ge_runner_lib}"
      "${atlas_graph_lib}"
      "${atlas_acl_lib}"
      "${ascendcl_lib}"
      "${acl_op_compiler_lib}")
    if(NOT EXISTS "${cann_required_lib}")
      message(FATAL_ERROR
        "WITH_CANN=ON but ${cann_required_lib} does not exist. "
        "Install the CANN toolkit or set the ASCEND_CUSTOM_PATH environment variable "
        "to the Ascend installation root (ASCEND_DIR is currently ${ASCEND_DIR}).")
    endif()
  endforeach()

  # Directory-scoped on purpose: this matches the original behaviour, so every
  # target defined in this directory keeps seeing the toolkit headers.
  include_directories(
    "${ATLAS_RUNTIME_INC_DIR}"
    "${FWKACLLIB_INC_DIR}"
    "${ACLLIB_INC_DIR}")

  # Graph engine / ACL runtime libraries.
  add_library(ascend_ge SHARED IMPORTED GLOBAL)
  set_target_properties(ascend_ge PROPERTIES IMPORTED_LOCATION "${atlas_ge_runner_lib}")

  add_library(ascend_graph SHARED IMPORTED GLOBAL)
  set_target_properties(ascend_graph PROPERTIES IMPORTED_LOCATION "${atlas_graph_lib}")

  add_library(atlas_acl SHARED IMPORTED GLOBAL)
  set_target_properties(atlas_acl PROPERTIES IMPORTED_LOCATION "${atlas_acl_lib}")

  set(extern_ascend ascend_ge ascend_graph atlas_acl CACHE INTERNAL "acllib runtime libs")

  # AscendCL toolkit libraries.
  add_library(ascendcl SHARED IMPORTED GLOBAL)
  set_target_properties(ascendcl PROPERTIES IMPORTED_LOCATION "${ascendcl_lib}")

  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
  set_target_properties(ascend_hccl PROPERTIES IMPORTED_LOCATION "${ascend_hccl_lib}")

  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
  set_target_properties(acl_op_compiler PROPERTIES IMPORTED_LOCATION "${acl_op_compiler_lib}")

  set(extern_ascend_cl ascendcl acl_op_compiler CACHE INTERNAL "acltoolkit libs")

  # NPU implementations of the backend primitives and operators.
  list(APPEND SOURCES
    src/cann/allocator.cc
    src/cann/primitives.cc
    src/cann/utils.cc
    src/ops/topk_npu.cc
    src/ops/dequantize_npu.cc
    src/ops/gumbel_max_npu.cc
    src/ops/topp_mask_npu.cc
    src/ops/multinomial_npu.cc
    src/ops/gather_npu.cc
    src/ops/conv1d_npu.cc
    src/ops/concat_split_slide_npu.cc
    src/ops/alibi_add_npu.cc
    src/ops/softmax_npu.cc
    src/ops/tile_npu.cc
    src/ops/rms_norm_npu.cc
    src/ops/layer_norm_npu.cc
    src/ops/rotary_npu.cc
    src/ops/bias_add_npu.cc
    src/ops/mean_npu.cc
    src/ops/quantize_npu.cc)
  add_library(${PROJECT_NAME} ${SOURCES})
  list(APPEND LIBRARIES ${extern_ascend} ${extern_ascend_cl})
else()
add_library(${PROJECT_NAME} ${SOURCES})
endif()
Expand All @@ -540,6 +646,7 @@ set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY
)

list(APPEND LIBRARIES ${CMAKE_DL_LIBS})

target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBRARIES})
target_include_directories(${PROJECT_NAME} BEFORE
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ The project is production-oriented and comes with [backward compatibility guaran

## Key features

* **Fast and efficient execution on CPU and GPU**<br/>The execution [is significantly faster and requires less resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
* **Fast and efficient execution on CPU, GPU and NPU**<br/>The execution [is significantly faster and requires less resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
* **Quantization and reduced precision**<br/>The model serialization and computation support weights with [reduced precision](https://opennmt.net/CTranslate2/quantization.html): 16-bit floating points (FP16), 16-bit brain floating points (BF16), 16-bit integers (INT16), and 8-bit integers (INT8).
* **Multiple CPU architectures support**<br/>The project supports x86-64 and AArch64/ARM64 processors and integrates multiple backends that are optimized for these platforms: [Intel MKL](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html), [oneDNN](https://github.com/oneapi-src/oneDNN), [OpenBLAS](https://www.openblas.net/), [Ruy](https://github.com/google/ruy), and [Apple Accelerate](https://developer.apple.com/documentation/accelerate).
* **Automatic CPU detection and code dispatch**<br/>One binary can include multiple backends (e.g. Intel MKL and oneDNN) and instruction set architectures (e.g. AVX, AVX2) that are automatically selected at runtime based on the CPU information.
* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs or CPU cores.
* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on both CPU and GPU.
* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs, NPUs or CPU cores.
* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on CPU, GPU and NPU.
* **Lightweight on disk**<br/>Quantization can make the models 4 times smaller on disk with minimal accuracy loss.
* **Simple integration**<br/>The project has few dependencies and exposes simple APIs in [Python](https://opennmt.net/CTranslate2/python/overview.html) and C++ to cover most integration needs.
* **Configurable and interactive decoding**<br/>[Advanced decoding features](https://opennmt.net/CTranslate2/decoding.html) allow autocompleting a partial sequence and returning alternatives at a specific location in the sequence.
Expand Down
8 changes: 7 additions & 1 deletion cli/translator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ int main(int argc, char* argv[]) {
cxxopts::value<size_t>()->default_value("1"))
("intra_threads", "Number of computation threads (set to 0 to use the default value).",
cxxopts::value<size_t>()->default_value("0"))
("device", "Device to use (can be cpu, cuda, auto).",
("device", "Device to use (can be cpu, cuda, cann, auto).",
cxxopts::value<std::string>()->default_value("cpu"))
("device_index", "Comma-separated list of device IDs to use.",
cxxopts::value<std::vector<int>>()->default_value("0"))
Expand All @@ -44,6 +44,8 @@ int main(int argc, char* argv[]) {
cxxopts::value<std::string>()->default_value("default"))
("cuda_compute_type", "Computation type on CUDA devices (overrides compute_type)",
cxxopts::value<std::string>())
("cann_compute_type", "Computation type on CANN devices (overrides compute_type)",
cxxopts::value<std::string>())
("cpu_compute_type", "Computation type on CPU devices (overrides compute_type)",
cxxopts::value<std::string>())
;
Expand Down Expand Up @@ -139,6 +141,10 @@ int main(int argc, char* argv[]) {
if (args.count("cuda_compute_type"))
compute_type = ctranslate2::str_to_compute_type(args["cuda_compute_type"].as<std::string>());
break;
case ctranslate2::Device::CANN:
if (args.count("cann_compute_type"))
compute_type = ctranslate2::str_to_compute_type(args["cann_compute_type"].as<std::string>());
break;
};

ctranslate2::ReplicaPoolConfig pool_config;
Expand Down
78 changes: 78 additions & 0 deletions docker/cann/Dockerfile_cann
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Extend/build an image for CANN support
# Ascend-cann-toolkit_<VERSION>.run is expected to exist in <project_root>/ascend_install_files

# preferably arm64
FROM ubuntu:20.04

# Build-time only: makes every apt invocation below non-interactive
# (a "VAR=value cmd1 && cmd2" prefix would only apply to cmd1).
ARG DEBIAN_FRONTEND=noninteractive

# File name of the toolkit installer inside ascend_install_files.
# Override with: --build-arg CANN_TOOLKIT_RUN=<file name>
ARG CANN_TOOLKIT_RUN=Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run

# System packages. Installation and cleanup share one layer so that the apt
# caches never end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        net-tools libsqlite3-dev zlib1g openssl \
        ca-certificates bc wget \
        curl gdb cmake gcc make g++ pkg-config unzip \
        libblas3 liblapack3 gfortran vim \
        liblapack-dev libblas-dev libhdf5-dev libffi-dev \
        libssl-dev zlib1g-dev xz-utils cython3 python3-h5py \
        libopenblas-dev libgmpxx4ldbl liblzma-dev \
        pciutils \
        python3-dev python3-pip && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Python packages
RUN python3 -m pip --no-cache-dir install \
        numpy decorator sympy cffi pyyaml pathlib2 protobuf scipy \
        psutil requests absl-py \
        attrs && \
    rm -rf /root/.cache/pip

# Install Ascend toolkit
COPY ascend_install_files ascend_install_files
RUN chmod +x "ascend_install_files/${CANN_TOOLKIT_RUN}" && \
    "ascend_install_files/${CANN_TOOLKIT_RUN}" --install && \
    rm -f "ascend_install_files/${CANN_TOOLKIT_RUN}"

# Add usergroup & user
RUN groupadd HwHiAiUser && useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser

# This is copied from /usr/local/Ascend/ascend-toolkit/set_env.sh of the respective ascend-toolkit version
ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:$LD_LIBRARY_PATH
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:$PYTHONPATH
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:$PATH
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

# ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH
# ENV HCCL_CONNECT_TIMEOUT=7200
# ENV HCCL_WHITELIST_DISABLE=1
# ENV HCCL_SECURITY_MODE=1

ENV ASCEND_GLOBAL_LOG_LEVEL=3

# Set env vars again in case of interactive ssh connection (ascend-toolkit assumed to be already installed)
RUN cp /usr/local/Ascend/ascend-toolkit/set_env.sh /etc/profile.d/ && \
    chmod 644 /etc/profile.d/set_env.sh
15 changes: 15 additions & 0 deletions docker/cann/run_container_cann.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Build the CANN image and start a detached container from it.
# Paths are resolved relative to this script, so it can be invoked from any
# directory. IMAGE_NAME and CONTAINER_NAME may be overridden through the
# environment.

# Stop at the first failure so a broken image build never reaches `docker run`.
set -euo pipefail

IMAGE_NAME="${IMAGE_NAME:-ctranslate2-aarch64}"
CONTAINER_NAME="${CONTAINER_NAME:-ctranslate2-aarch64}"

# build image that will host CANN environment (build context: project root)
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
docker build -t "${IMAGE_NAME}" -f docker/cann/Dockerfile_cann --platform linux/arm64 .

# run the respective container; the host's Ascend driver, npu-smi and dcmi
# are bind-mounted into it
docker run \
    -d --cap-add sys_ptrace \
    --pids-limit 409600 \
    --privileged --shm-size=128G \
    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/dcmi:/usr/local/dcmi \
    --name "${CONTAINER_NAME}" "${IMAGE_NAME}"
7 changes: 7 additions & 0 deletions docs/hardware_support.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,10 @@ See the [environment variables](environment_variables.md) `CT2_USE_MKL` and `CT2
* NVIDIA GPUs with a Compute Capability greater or equal to 3.5

The driver requirement depends on the CUDA version. See the [CUDA Compatibility guide](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) for more information.

## NPU

* AArch64/ARM64 processors
* Ascend NPU AI Processor greater or equal to 910A

`CANN` version greater or equal to `7.0.RC1.alpha001` (depends on NPU model). See [CANN documentation](https://support.huawei.com/enterprise/en/ascend-computing/cann-pid-251168373) for more information.
10 changes: 10 additions & 0 deletions examples/cann/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Standalone build of the CANN example executable (cann_run).
# CMAKE_CXX_STANDARD 17 is only understood by CMake 3.8 and later.
cmake_minimum_required(VERSION 3.8)
project(cann)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Default to an optimized build, but only for single-configuration generators
# and only when the user did not choose a build type.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

find_package(Threads REQUIRED)

add_executable(cann_run main.cc)
# ctranslate2 is linked by plain library name, so it must be discoverable by
# the linker.
target_link_libraries(cann_run PRIVATE
  Threads::Threads
  ctranslate2
)
45 changes: 45 additions & 0 deletions examples/cann/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# CANN example query
This example demonstrates a translation query employing `CANN` using the English-German Transformer model trained with OpenNMT-py as in [CTranslate2 documentation](https://opennmt.net/CTranslate2/quickstart.html).

## Environment setup
- Create environment: `docker/cann/Dockerfile_cann`
- Run the container: `docker/cann/run_container_cann.sh`

## Download model
```bash
wget https://s3.amazonaws.com/opennmt-models/transformer-ende-wmt-pyOnmt.tar.gz
tar xf transformer-ende-wmt-pyOnmt.tar.gz
```

## Build executable
Run `examples/cann/build_run.sh`

### Expected output

```
current path: "<current path>"
input data path: "<input data path>"
[<timestamp>] [ctranslate2] [thread 49835] [info] CPU: ARM (NEON=true)
[<timestamp>] [ctranslate2] [thread 49835] [info] - Selected ISA: NEON
[<timestamp>] [ctranslate2] [thread 49835] [info] - Use Intel MKL: false
[<timestamp>] [ctranslate2] [thread 49835] [info] - SGEMM backend: OpenBLAS (packed: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] - GEMM_S16 backend: none (packed: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] - GEMM_S8 backend: Ruy (packed: false, u8s8 preferred: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] NPU:
[<timestamp>] [ctranslate2] [thread 49835] [info] - Number of NPU cores: 8
[<timestamp>] [ctranslate2] [thread 49835] [info] - aclrtRunMode: ACL_HOST
[<timestamp>] [ctranslate2] [thread 49835] [info] Loaded model <path> on device cann:0
[<timestamp>] [ctranslate2] [thread 49835] [info] - Binary version: 6
[<timestamp>] [ctranslate2] [thread 49835] [info] - Model specification revision: 7
[<timestamp>] [ctranslate2] [thread 49835] [info] - Selected compute type: float32
input data:
▁H ello ▁world !
Start: Warmup examples
output:
▁Hallo ▁Welt !
input data:
▁H ello ▁world !
Start: Query examples
output:
▁Hallo ▁Welt !
```
19 changes: 19 additions & 0 deletions examples/cann/build_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
#
# Build the ctranslate2 library with the CANN backend, then build the
# cann_run example in the same build directory.

# execute from project root

# Abort on the first failing step so the example is never built after a
# failed library configure/build.
set -euo pipefail

# first build ct2lib (always from a clean build directory)
rm -rf build-release/
mkdir build-release
cd build-release

cmake -DWITH_CANN=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_CLI=OFF -DWITH_MKL=OFF -DOPENMP_RUNTIME=COMP -DCMAKE_PREFIX_PATH="/opt/OpenBLAS" -DWITH_OPENBLAS=ON -DWITH_RUY=ON ..

make -j"$(nproc)"

# drop the library's cache so this directory can be configured for the example
rm -f CMakeCache.txt

# then build cann_run
cmake -DCMAKE_BUILD_TYPE=Release ../examples/cann/

make -j"$(nproc)"
# ./cann_run <ende_ctranslate2_path>
Loading
Loading