diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a5fac5..4e90db1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,19 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
 and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.0]
+
+### Added
+* `scripts/ubuntu_setup.sh` for setting up a GPU-based Ubuntu EC2 AMI.
+* `scripts/amazon_linux_setup.sh` for setting up a GPU-based Amazon Linux 2023 EC2 AMI.
+
+### Changed
+* Refactored `scripts/build_proc.sh` to combine GPU compilation steps.
+* Final product zip archive is now always created.
+
+### Fixed
+* `Dockerfile.gpu` so that outputs will contain actual data.
+
 ## [0.4.0]
 
 ### Added
diff --git a/Dockerfile.gpu b/Dockerfile.gpu
index 9066135..09cbabc 100644
--- a/Dockerfile.gpu
+++ b/Dockerfile.gpu
@@ -1,4 +1,24 @@
-FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS builder
+
+# FIXME: should be able to find this dynamically
+ARG GPU_ARCH=89
+
+# The GPU_ARCH and USEGPU environment variables are used by build_proc.sh
+ENV FFTW_LIB=/usr/lib/x86_64-linux-gnu/libfftw3f.a
+ENV GPU_ARCH=${GPU_ARCH}
+ENV USEGPU=true
+ENV DEBIAN_FRONTEND=noninteractive
+
+# FIXME: can remove git after switching back to a released version of back-projection
+RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl git build-essential gfortran libfftw3-dev && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN git clone -b main https://github.com/ASFHyP3/back-projection.git
+COPY . /hyp3-back-projection/
+COPY ./scripts/build_proc.sh ./back-projection
+RUN cd /back-projection && ./build_proc.sh && cd /
+
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 AS runner
 
 # For opencontainers label definitions, see:
 # https://github.com/opencontainers/image-spec/blob/master/annotations.md
@@ -11,32 +31,21 @@ LABEL org.opencontainers.image.url="https://github.com/ASFHyP3/hyp3-back-projection"
 LABEL org.opencontainers.image.source="https://github.com/ASFHyP3/hyp3-back-projection"
 LABEL org.opencontainers.image.documentation="https://hyp3-docs.asf.alaska.edu"
 
-ARG DEBIAN_FRONTEND=noninteractive
 ARG CONDA_UID=1000
 ARG CONDA_GID=1000
-ARG BACK_PROJECTION_TAG=0.2.0
-ARG FFTW_TAG=3.3.9
 ARG MINIFORGE_NAME=Miniforge3
 ARG MINIFORGE_VERSION=24.3.0-0
 
-# USEGPU environment variable used by build_proc.sh
-ENV USEGPU="true"
 ENV CONDA_DIR=/opt/conda
 ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
 ENV PATH=${CONDA_DIR}/bin:${PATH}
 ENV PYTHONDONTWRITEBYTECODE=true
-ENV PROC_HOME=/home/conda/back-projection
+ENV PROC_HOME=/back-projection
 ENV MYHOME=/home/conda
+ENV DEBIAN_FRONTEND=noninteractive
 
 # Conda setup
-RUN apt-get update > /dev/null && \
-    apt-get install --no-install-recommends --yes \
-        wget bzip2 ca-certificates \
-        git \
-        tini \
-        > /dev/null && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* && \
+RUN apt-get update && apt-get install --no-install-recommends --yes wget bzip2 ca-certificates git > /dev/null && \
     wget --no-hsts --quiet https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/${MINIFORGE_NAME}-${MINIFORGE_VERSION}-Linux-$(uname -m).sh -O /tmp/miniforge.sh && \
     /bin/bash /tmp/miniforge.sh -b -p ${CONDA_DIR} && \
     rm /tmp/miniforge.sh && \
@@ -47,7 +56,7 @@ RUN apt-get update > /dev/null && \
     echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \
     echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc
 
-RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl build-essential gfortran libfftw3-dev && \
+RUN apt-get install -y --no-install-recommends unzip vim curl gfortran && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 RUN groupadd -g "${CONDA_GID}" --system conda && \
@@ -61,20 +70,8 @@ SHELL ["/bin/bash", "-l", "-c"]
 USER ${CONDA_UID}
 WORKDIR /home/conda/
 
-RUN curl -sL https://github.com/ASFHyP3/back-projection/archive/refs/tags/v${BACK_PROJECTION_TAG}.tar.gz > ./back-projection.tar.gz && \
-    mkdir -p ./back-projection && \
-    tar -xvf ./back-projection.tar.gz -C ./back-projection/ --strip=1 && \
-    rm ./back-projection.tar.gz && \
-    rm -rf ./back-projection/fft
-
-COPY --chown=${CONDA_UID}:${CONDA_GID} ./scripts/build_proc.sh ./back-projection
-RUN cd /home/conda/back-projection && \
-    chmod +x ./build_proc.sh && \
-    ./build_proc.sh && \
-    find $PROC_HOME -type f -name "*.py" -exec chmod +x {} + && \
-    cd /home/conda/
-
-COPY --chown=${CONDA_UID}:${CONDA_GID} . /hyp3-back-projection/
+COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /back-projection /back-projection
+COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /hyp3-back-projection /hyp3-back-projection
 
 RUN mamba env create -f /hyp3-back-projection/environment.yml && \
     conda clean -afy && \
diff --git a/README.md b/README.md
index 933c3f2..85111e8 100644
--- a/README.md
+++ b/README.md
@@ -58,29 +58,18 @@
 The process is different for different OS's and Linux distros.
 The setup process can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration).
 Make sure to follow the [Docker configuration steps](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration) after installing the package.
 
 ### EC2 Setup
+> [!CAUTION]
+> The Docker container runs on an Amazon Linux 2023 Deep Learning AMI, but produces all-zero outputs. Work is ongoing to determine the cause of this issue. For now, we recommend option 2.i below.
+
 When running on an EC2 instance, the following setup is recommended:
-1. Create a [P3-family EC2 instance](https://aws.amazon.com/ec2/instance-types/p3/) with the [Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver](https://aws.amazon.com/marketplace/pp/prodview-64e4rx3h733ru?sr=0-4&ref_=beagle&applicationId=AWSMPContessa)
-2. Install Docker and the nvidia-container-toolkit on the EC2 instance:
-```bash
-sudo yum-config-manager --disable amzn2-graphics
-curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
-sudo yum install docker -y
-sudo yum install nvidia-container-toolkit -y
-sudo yum-config-manager --enable amzn2-graphics
-```
-3. Optionally, set up Docker to not require `sudo` and to start when the EC2 instance starts
-```bash
-sudo systemctl start docker && \
-sudo usermod -a -G docker ec2-user && \
-sudo systemctl enable docker
-```
-4. Exit the EC2 instance and re-enter
-5. To test the GPU setup, run the base NVIDIA container:
-```bash
-docker run -it --gpus all nvidia/cuda:12.4.1-devel-ubuntu20.04 nvidia-smi
-```
-6. Build the actual container and run it:
+1. Create a [G6-family EC2 instance](https://aws.amazon.com/ec2/instance-types/g6/) that has **at least 32 GB of memory**.
+2. Launch your instance with one of the following setups (**option i is recommended**):
+    1. Use the latest [Amazon Linux 2023 AMI](https://docs.aws.amazon.com/linux/al2023/ug/ec2.html) with `scripts/amazon_linux_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
+    2. Use the latest [Ubuntu AMI](https://cloud-images.ubuntu.com/locator/ec2/) with `scripts/ubuntu_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
+    3. Use the [Ubuntu Deep Learning Base OSS Nvidia Driver GPU AMI](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/) (no install script required).
+3. Build the GPU Docker container with the correct compute capability version. To determine this value, run `nvidia-smi` on the instance to obtain the GPU type, then cross-reference it with NVIDIA's [GPU compute capability list](https://developer.nvidia.com/cuda-gpus). For a g6.2xlarge instance, this would be:
 ```bash
-docker build -t back-projection:gpu -f Dockerfile.gpu .
-docker run --gpus=all --rm -it back-projection:gpu ++process back_projection --help
+docker build --build-arg="GPU_ARCH=89" -t back-projection:gpu-89 -f Dockerfile.gpu .
 ```
+The compute capability version is always the same for a given instance type, so you will only need to look it up once per instance type.
+The default value for this argument is `89`, the correct value for g6.2xlarge instances.
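A note on step 3: recent NVIDIA drivers can also report the compute capability directly, which could eventually remove both the manual lookup and the `FIXME` in `Dockerfile.gpu`. A minimal sketch, assuming a driver new enough to support the `compute_cap` query field:

```bash
# Query GPU 0's compute capability (e.g. "8.9") and strip the dot to get "89".
GPU_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')
docker build --build-arg="GPU_ARCH=${GPU_ARCH}" -t back-projection:gpu-${GPU_ARCH} -f Dockerfile.gpu .
```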
diff --git a/scripts/amazon_linux_setup.sh b/scripts/amazon_linux_setup.sh
new file mode 100755
index 0000000..0ee262a
--- /dev/null
+++ b/scripts/amazon_linux_setup.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# GPU setup for Amazon Linux 2023
+
+# Install NVIDIA driver
+DRIVER_VERSION=550.54.14
+sudo dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) kernel-modules-extra
+curl -fsSL -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+sudo ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --tmpdir . --silent
+rm ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
+
+# Install and enable Docker
+sudo dnf install -y docker git
+sudo systemctl start docker
+sudo systemctl enable docker
+sudo usermod -aG docker ec2-user
+
+# Install nvidia-container-toolkit
+sudo dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
+sudo dnf install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+# Cleanup
+sudo dnf clean all && sudo rm -rf /var/cache/dnf/*
+
+# Reboot
+sudo reboot
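Both setup scripts end with a reboot. After reconnecting, it is worth verifying the driver and container runtime before building anything; a quick smoke test, reusing the runtime image already referenced in `Dockerfile.gpu`:

```bash
# Should print the GPU table from inside a container; a failure points to the
# driver install or the nvidia-container-toolkit configuration.
docker run --rm --gpus all nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi
```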
diff --git a/scripts/build_proc.sh b/scripts/build_proc.sh
old mode 100644
new mode 100755
index 08e080e..3abd0a5
--- a/scripts/build_proc.sh
+++ b/scripts/build_proc.sh
@@ -1,10 +1,13 @@
 #!/bin/bash
-MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
-FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
+# Keeping these lines here in case we need to switch back to grabbing the FFTW location
+# dynamically again
+# MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
+# FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
 echo 'using FFTW library:' $FFTW_LIB
 
 if [[ "$USEGPU" == "true" ]]; then
-    echo 'building with GPU support'
+    nvcc -o gpu_arch gpu_arch.cu
+    echo 'building with GPU support, capability version' $GPU_ARCH
 fi
 
 cd DEM
@@ -21,11 +24,6 @@ gfortran -c processsubcpu.f90 backprojectcpusub.f90 bounds.f90 orbitrangetime.f90 latlon.f90 intp_orbit.f90 radar_to_xyz.f90 unitvec.f90 tcnbasis.f90 curvature.f90 cross.f90 orbithermite.f sentineltimingsub.f90 getburststatevectors.f90 -ffixed-line-length-none -fopenmp
 gcc -o sentinel_raw_process_cpu sentinel_raw_process_cpu.o decode_line_memory.o processsubcpu.o backprojectcpusub.o azimuth_compress_cpu.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lgfortran -lgomp -lm -lrt -lpthread
 echo 'built sentinel_raw_process_cpu'
 
-if [[ "$USEGPU" == "true" ]]; then
-    nvcc -o howmanygpus howmanygpus.cu
-    echo 'built howmanygpus'
-fi
-
 cd geo2rdr
@@ -72,7 +70,6 @@
 gfortran -o psinterp psinterp.f90 -fopenmp
 echo 'Built cosine_sim and psinterp in ps directory'
 cd ..
-tar xf snaphu_v2_0b0_0_0.tar
 cd snaphu_v2.0b0.0.0/src
 make CFLAGS=-O3 -s
@@ -89,14 +86,13 @@
 gcc -c filelen.c io.c sentinel_raw_process.c decode_line_memory.c -lm -fopenmp
 echo 'built raw_process components in sentinel'
 
-if [[ "$USEGPU" == "true" ]]; then
-    nvcc -gencode arch=compute_89,code=sm_89 -c azimuth_compress.cu -Wno-deprecated-gpu-targets
-fi
-
 gfortran -c processsub.f90 backprojectgpusub.f90 bounds.f90 orbitrangetime.f90 latlon.f90 intp_orbit.f90 radar_to_xyz.f90 unitvec.f90 tcnbasis.f90 curvature.f90 cross.f90 orbithermite.f sentineltimingsub.f90 getburststatevectors.f90 -ffixed-line-length-none -fopenmp
 
 if [[ "$USEGPU" == "true" ]]; then
-    nvcc -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
+    nvcc -o howmanygpus howmanygpus.cu
+    nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -c azimuth_compress.cu -Wno-deprecated-gpu-targets
+    nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
+    echo 'built gpu components in sentinel'
 fi
 
 cd ..
diff --git a/scripts/ubuntu_setup.sh b/scripts/ubuntu_setup.sh
new file mode 100755
index 0000000..0bba522
--- /dev/null
+++ b/scripts/ubuntu_setup.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# GPU setup for Ubuntu 22.04
+
+# NVIDIA source setup
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
+curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+sudo dpkg -i cuda-keyring_1.1-1_all.deb && \
+rm cuda-keyring_1.1-1_all.deb
+
+# Docker source setup
+sudo apt-get install -y ca-certificates curl gnupg lsb-release && \
+sudo mkdir -p /etc/apt/keyrings && \
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
+echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+# Installs
+sudo apt-get update && \
+sudo apt-get install -y nvidia-headless-535-server nvidia-utils-535-server nvidia-container-toolkit docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin awscli git && \
+sudo usermod -aG docker ubuntu
+
+# Cleanup temporary files
+sudo apt-get clean
+sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Reboot
+sudo reboot
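For completeness, both scripts are intended to run as EC2 user data rather than interactively. A launch sketch with the AWS CLI, where the AMI ID and any omitted networking options are placeholders to fill in for your account and region:

```bash
# Launch a g6.2xlarge that runs ubuntu_setup.sh on first boot.
# ami-XXXXXXXXXXXXXXXXX is a placeholder for the current Ubuntu 22.04 AMI in your region.
aws ec2 run-instances \
    --image-id ami-XXXXXXXXXXXXXXXXX \
    --instance-type g6.2xlarge \
    --user-data file://scripts/ubuntu_setup.sh
```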
diff --git a/src/hyp3_back_projection/back_projection.py b/src/hyp3_back_projection/back_projection.py
index d702eef..b4fc721 100644
--- a/src/hyp3_back_projection/back_projection.py
+++ b/src/hyp3_back_projection/back_projection.py
@@ -140,8 +140,8 @@ def back_project(
     utils.call_stanford_module('util/merge_slcs.py', work_dir=work_dir)
 
+    zip_path = create_product(work_dir)
     if bucket:
-        zip_path = create_product(work_dir)
         upload_file_to_s3(zip_path, bucket, bucket_prefix)
 
     print(f'Finished back-projection for {list(work_dir.glob("S1*.geo"))[0].with_suffix("").name}!')
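With the image built as described in the README, the container can be smoke-tested the same way the old instructions did; the invocation below is the one removed from the README, updated for the new image tag:

```bash
# ++process selects the HyP3 process to run inside the container.
docker run --gpus=all --rm -it back-projection:gpu-89 ++process back_projection --help
```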