From a1b0558493930dca4f5c86db658ad295b9beabd6 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Thu, 19 Dec 2024 11:04:17 +0100
Subject: [PATCH] fast import: importer: use aws s3 cli (#10162)

## Problem

s5cmd doesn't pick up the pod service account:

```
2024/12/16 16:26:01 Ignoring, HTTP credential provider invalid endpoint host, "169.254.170.23", only loopback hosts are allowed.
ERROR "ls s3://neon-dev-bulk-import-us-east-2/import-pgdata/fast-import/v1/br-wandering-hall-w2xobawv": NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
```

## Summary of changes

Switch to the official AWS CLI.

## Testing

Tested the pre-merge image in staging, using the `job_image` override in project settings.

https://neondb.slack.com/archives/C033RQ5SPDH/p1734554944391949?thread_ts=1734368383.258759&cid=C033RQ5SPDH

## Future Work

Switch back to s5cmd once https://github.com/peak/s5cmd/pull/769 gets merged.

## Refs

- fixes https://github.com/neondatabase/cloud/issues/21876

---------

Co-authored-by: Gleb Novikov
---
 compute/compute-node.Dockerfile               | 24 ++++++++++---------
 compute_tools/src/bin/fast_import.rs          | 10 ++++----
 .../fast_import/{s5cmd.rs => aws_s3_sync.rs}  | 13 ++++------
 3 files changed, 23 insertions(+), 24 deletions(-)
 rename compute_tools/src/bin/fast_import/{s5cmd.rs => aws_s3_sync.rs} (50%)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 9f1f3b734363..5e7b4e8287ce 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1556,28 +1556,30 @@ RUN apt update && \
         locales \
         procps \
         ca-certificates \
+        curl \
+        unzip \
         $VERSION_INSTALLS && \
     apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

-# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2
-# used by fast_import
+# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step)
 ARG TARGETARCH
-ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb
 RUN set -ex; \
-    \
-    # Determine the expected checksum based on TARGETARCH
     if [ "${TARGETARCH}" = "amd64" ]; then \
-        CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \
+        TARGETARCH_ALT="x86_64"; \
+        CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \
     elif [ "${TARGETARCH}" = "arm64" ]; then \
-        CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \
+        TARGETARCH_ALT="aarch64"; \
+        CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \
     else \
         echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
     fi; \
-    \
-    # Compute and validate the checksum
-    echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c -
-RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb
+    curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
+    echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \
+    unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
+    /tmp/awscliv2/aws/install; \
+    rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \
+    true

 ENV LANG=en_US.utf8
 USER postgres
diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index b6db3eb11abc..793ec4cf1094 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -34,12 +34,12 @@ use nix::unistd::Pid;
 use tracing::{info, info_span, warn, Instrument};
 use utils::fs_ext::is_directory_empty;

+#[path = "fast_import/aws_s3_sync.rs"]
+mod aws_s3_sync;
 #[path = "fast_import/child_stdio_to_log.rs"]
 mod child_stdio_to_log;
 #[path = "fast_import/s3_uri.rs"]
 mod s3_uri;
-#[path = "fast_import/s5cmd.rs"]
-mod s5cmd;

 #[derive(clap::Parser)]
 struct Args {
@@ -326,7 +326,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     }

     info!("upload pgdata");
-    s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/"))
+    aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
         .await
         .context("sync dump directory to destination")?;

@@ -334,10 +334,10 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     {
         let status_dir = working_directory.join("status");
         std::fs::create_dir(&status_dir).context("create status directory")?;
-        let status_file = status_dir.join("status");
+        let status_file = status_dir.join("pgdata");
         std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
             .context("write status file")?;
-        s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata"))
+        aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
             .await
             .context("sync status directory to destination")?;
     }
diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs
similarity index 50%
rename from compute_tools/src/bin/fast_import/s5cmd.rs
rename to compute_tools/src/bin/fast_import/aws_s3_sync.rs
index d2d9a79736fe..5fa58c8f875f 100644
--- a/compute_tools/src/bin/fast_import/s5cmd.rs
+++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs
@@ -4,24 +4,21 @@ use camino::Utf8Path;
 use super::s3_uri::S3Uri;

 pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> {
-    let mut builder = tokio::process::Command::new("s5cmd");
-    // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL
-    if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") {
-        builder.arg("--endpoint-url").arg(val);
-    }
+    let mut builder = tokio::process::Command::new("aws");
     builder
+        .arg("s3")
         .arg("sync")
         .arg(local.as_str())
         .arg(remote.to_string());
     let st = builder
         .spawn()
-        .context("spawn s5cmd")?
+        .context("spawn aws s3 sync")?
         .wait()
         .await
-        .context("wait for s5cmd")?;
+        .context("wait for aws s3 sync")?;
     if st.success() {
         Ok(())
     } else {
-        Err(anyhow::anyhow!("s5cmd failed"))
+        Err(anyhow::anyhow!("aws s3 sync failed"))
     }
 }
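For illustration, after this change the importer ends up spawning the AWS CLI roughly as below. The working directory, bucket, and prefix are made-up placeholders (the real values come from the importer's arguments); only the `aws s3 sync <local> <remote>` shape is what `aws_s3_sync::sync` actually runs:

```
# Hypothetical example of the two sync calls fast_import now performs.
# Credentials are resolved by the AWS CLI itself, which, unlike s5cmd,
# can use the pod service account's container credential provider.
aws s3 sync /workdir/pgdata s3://example-bucket/fast-import/v1/example-branch/pgdata/
aws s3 sync /workdir/status s3://example-bucket/fast-import/v1/example-branch/status/
```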