From e2f3831c387a2e4684f44c34c1a27c4ed9c0953d Mon Sep 17 00:00:00 2001 From: ranchodeluxe Date: Mon, 4 Mar 2024 06:55:05 -0800 Subject: [PATCH 1/7] heap allocations --- .github/workflows/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/config.py b/.github/workflows/config.py index c387122..e73857e 100644 --- a/.github/workflows/config.py +++ b/.github/workflows/config.py @@ -80,8 +80,8 @@ def calc_task_manager_resources(task_manager_process_memory): # calculate dynamic values return { "total_flink": int(total_flink_memory), - "task_heap": int(remaining_memory * 0.90), - "task_off_heap": int(remaining_memory * 0.10), + "task_heap": int(remaining_memory * 0.75), + "task_off_heap": int(remaining_memory * 0.25), "task_memory_managed_fraction": managed_memory_ratio } From d5d52b9872ad112123ffd374d491000421f7536e Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Mon, 4 Mar 2024 14:36:33 -0800 Subject: [PATCH 2/7] Update job-runner.yaml --- .github/workflows/job-runner.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index 42332b2..b81d332 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -50,6 +50,7 @@ jobs: if: contains('["ranchodeluxe","abarciauskas-bgse", "norlandrhagen", "sharkinsspatial", "moradology", "thodson-usgs"]', github.actor) name: kickoff job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }} needs: name-job + environment: veda-smce outputs: job_name: ${{ steps.report_ids.outputs.job_name }} job_id: ${{ steps.report_ids.outputs.job_id }} @@ -94,7 +95,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name pangeo-forge-v3 --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name $EKS_CLUSTER_NAME --region ${{ secrets.GH_ACTIONS_AWS_REGION }} - name: execute recipe on k8s cluster id: executejob From 3e6a895dd95a8f078600916a6d53172dc2106485 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Mon, 4 Mar 2024 14:39:19 -0800 Subject: [PATCH 3/7] Update config.py --- .github/workflows/config.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/.github/workflows/config.py b/.github/workflows/config.py index e73857e..ce5584d 100644 --- a/.github/workflows/config.py +++ b/.github/workflows/config.py @@ -1,17 +1,5 @@ import os -bucket_choice = os.environ.get("S3_BUCKET") -bucket_options = { - "default": "s3://gcorradini-forge-runner-test", - "test": "s3://gcorradini-forge-runner-test", -} -s3_uri = bucket_options.get(bucket_choice) -if not s3_uri: - raise ValueError( - f"'S3_BUCKET_OPTIONS_MAP' did not have a key for '{bucket_choice}'. Options are {bucket_options}" - ) - - def calc_task_manager_resources(task_manager_process_memory): """ illustration of Flink memory model: @@ -104,8 +92,6 @@ def calc_task_manager_resources(task_manager_process_memory): ) print(f"[ CALCULATED TASK MANAGER RESOURCES ]: {task_manager_resources}") - -BUCKET_PREFIX = s3_uri c.Bake.prune = bool(int(os.environ.get("PRUNE_OPTION"))) c.Bake.container_image = "apache/beam_python3.10_sdk:2.52.0" c.Bake.bakery_class = "pangeo_forge_runner.bakery.flink.FlinkOperatorBakery" @@ -127,11 +113,10 @@ def calc_task_manager_resources(task_manager_process_memory): "taskmanager.memory.managed.fraction": f"{task_manager_resources['task_memory_managed_fraction']}" } +BUCKET_PREFIX = os.environ.get("OUTPUT_BUCKET") c.TargetStorage.fsspec_class = "s3fs.S3FileSystem" c.TargetStorage.root_path = f"{BUCKET_PREFIX}/{{job_name}}/output" c.TargetStorage.fsspec_args = { - "key": os.environ.get("S3_DEFAULT_AWS_ACCESS_KEY_ID"), - "secret": os.environ.get("S3_DEFAULT_AWS_SECRET_ACCESS_KEY"), "anon": False, "client_kwargs": {"region_name": "us-west-2"}, } From 795efef9bd5078ab847b42e1347c4f9ebac64274 Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Mon, 4 Mar 2024 14:47:18 -0800 Subject: [PATCH 4/7] Update job-runner.yaml --- .github/workflows/job-runner.yaml | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index b81d332..151c822 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -14,22 +14,18 @@ on: description: 'The subdir of the feedstock directory in the repo' required: true default: 'feedstock' - bucket: - description: 'This job runner leverages s3fs.S3FileSystem for your recipe cache and output. Choices currently are: "default"' - required: true - default: 'default' prune: description: 'Only run the first two time steps' required: true default: '0' parallelism: - description: 'Number of task managers to spin up' + description: 'Number of workers to run in parallel' required: true default: '1' - protocol: - description: 'What protocol to use when accessing files (s3 or https).' + auth_mode: + description: 'What auth mode (edl or iamrole) to use when accessing files.' required: false - default: 's3' + default: 'iamrole' resource_profile: description: 'jobs have different memory requirements so choose (small[7168M], medium[10240M], large[15360M], xlarge[20480M])' required: false @@ -71,7 +67,6 @@ jobs: echo "Manually triggered workflow: \ ${{ github.event.inputs.repo }} \ ${{ github.event.inputs.ref }} \ - ${{ github.event.inputs.bucket }} \ ${{ github.event.inputs.parallelism }} \ ${{ github.event.inputs.prune }}" @@ -123,7 +118,6 @@ jobs: FLINK_DASH=$(cat execute.log | grep -oP "You can run '\K[^']+(?=')") echo "FLINK_DASH=$FLINK_DASH" >> $GITHUB_ENV env: - EARTHDATA_TOKEN: ${{ secrets.EARTHDATA_TOKEN }} EARTHDATA_USERNAME: ${{ secrets.EARTHDATA_USERNAME }} EARTHDATA_PASSWORD: ${{ secrets.EARTHDATA_PASSWORD }} REPO: ${{ github.event.inputs.repo }} @@ -131,10 +125,8 @@ jobs: FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }} PRUNE_OPTION: ${{ github.event.inputs.prune }} PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }} - S3_BUCKET: ${{ github.event.inputs.bucket }} - S3_DEFAULT_AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEFAULT_AWS_ACCESS_KEY_ID }} - S3_DEFAULT_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEFAULT_AWS_SECRET_ACCESS_KEY }} - PROTOCOL: ${{ github.event.inputs.protocol }} + OUTPUT_BUCKET: $OUTPUT_BUCKET + AUTH_MODE: ${{ github.event.inputs.auth_mode }} RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }} - name: cleanup if "pangeo-forge-runner bake" failed @@ -214,7 +206,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name pangeo-forge-v3 --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name $EKS_CLUSTER_NAME --region ${{ secrets.GH_ACTIONS_AWS_REGION }} # - name: Setup upterm session # uses: lhotari/action-upterm@v1 From bec22de6cdf4437d7965baee6d522efabaa5a9fd Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Tue, 5 Mar 2024 11:56:26 -0800 Subject: [PATCH 5/7] Update job-runner.yaml --- .github/workflows/job-runner.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index 151c822..f5ee738 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -90,7 +90,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name $EKS_CLUSTER_NAME --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region ${{ secrets.GH_ACTIONS_AWS_REGION }} - name: execute recipe on k8s cluster id: executejob @@ -125,7 +125,7 @@ jobs: FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }} PRUNE_OPTION: ${{ github.event.inputs.prune }} PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }} - OUTPUT_BUCKET: $OUTPUT_BUCKET + OUTPUT_BUCKET: ${{ vars.OUTPUT_BUCKET }} AUTH_MODE: ${{ github.event.inputs.auth_mode }} RESOURCE_PROFILE: ${{ github.event.inputs.resource_profile }} @@ -206,7 +206,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name $EKS_CLUSTER_NAME --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region ${{ secrets.GH_ACTIONS_AWS_REGION }} # - name: Setup upterm session # uses: lhotari/action-upterm@v1 From 69453453efcc2c72e90dd6b42b9e16e00d82805d Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Tue, 5 Mar 2024 15:14:29 -0800 Subject: [PATCH 6/7] Update job-runner.yaml --- .github/workflows/job-runner.yaml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index f5ee738..111ab80 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -31,6 +31,10 @@ on: required: false default: 'small' +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + jobs: name-job: runs-on: ubuntu-latest @@ -56,6 +60,12 @@ jobs: - name: checkout repository uses: actions/checkout@v3 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc + role-session-name: veda-pforge-run-job + aws-region: us-west-2 - name: set up python 3.10 uses: actions/setup-python@v3 @@ -191,12 +201,12 @@ jobs: name: monitor job ${{ needs.name-job.outputs.repo_name }}@${{ github.event.inputs.ref }} needs: [name-job, run-job] steps: - - name: set up aws credentials for job runner user + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v2 with: - aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }} + role-to-assume: arn:aws:iam::444055461661:role/github-actions-role-eodc + role-session-name: veda-pforge-monitor-job + aws-region: us-west-2 - name: install kubectl run: | From 479b6714cf5f1850e83626aa332b564f076eabaa Mon Sep 17 00:00:00 2001 From: Aimee Barciauskas Date: Tue, 5 Mar 2024 16:20:33 -0800 Subject: [PATCH 7/7] Update job-runner.yaml --- .github/workflows/job-runner.yaml | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.github/workflows/job-runner.yaml b/.github/workflows/job-runner.yaml index 111ab80..e938877 100644 --- a/.github/workflows/job-runner.yaml +++ b/.github/workflows/job-runner.yaml @@ -85,13 +85,6 @@ jobs: python -m pip install --upgrade pip pip install pangeo-forge-runner>=0.10.0 - - name: set up aws credentials for job runner user - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }} - - name: install kubectl run: | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" @@ -100,7 +93,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region us-west-2 - name: execute recipe on k8s cluster id: executejob @@ -216,7 +209,7 @@ jobs: - name: update kubeconfig with cluster run: | - aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region ${{ secrets.GH_ACTIONS_AWS_REGION }} + aws eks update-kubeconfig --name ${{ vars.EKS_CLUSTER_NAME }} --region us-west-2 # - name: Setup upterm session # uses: lhotari/action-upterm@v1