Merge pull request #34 from teamdatatonic/feat/tutorial_notebook
Feat/tutorial notebook TT-85
felix-datatonic authored Nov 13, 2023
2 parents 90e2af8 + 6c7145b commit ee124f2
Showing 47 changed files with 4,345 additions and 2,817 deletions.
56 changes: 27 additions & 29 deletions Makefile
@@ -24,27 +24,28 @@ pre-commit: ## Runs the pre-commit checks over entire repo
poetry run pre-commit run --all-files

env ?= dev
AUTO_APPROVE_FLAG :=
deploy: ## Deploy the Terraform infrastructure to your project. Requires VERTEX_PROJECT_ID and VERTEX_LOCATION env variables to be set in env.sh. Optionally specify env=<dev|test|prod> (default = dev)
@ cd terraform/envs/$(env) && \
@if [ "$(auto-approve)" = "true" ]; then \
AUTO_APPROVE_FLAG="-auto-approve"; \
fi; \
cd terraform/envs/$(env) && \
terraform init -backend-config='bucket=${VERTEX_PROJECT_ID}-tfstate' && \
terraform apply -var 'project_id=${VERTEX_PROJECT_ID}' -var 'region=${VERTEX_LOCATION}'
terraform apply -var 'project_id=${VERTEX_PROJECT_ID}' -var 'region=${VERTEX_LOCATION}' $$AUTO_APPROVE_FLAG

undeploy: ## DESTROY the Terraform infrastructure in your project. Requires VERTEX_PROJECT_ID and VERTEX_LOCATION env variables to be set in env.sh. Optionally specify env=<dev|test|prod> (default = dev)
@ cd terraform/envs/$(env) && \
@if [ "$(auto-approve)" = "true" ]; then \
AUTO_APPROVE_FLAG="-auto-approve"; \
fi; \
cd terraform/envs/$(env) && \
terraform init -backend-config='bucket=${VERTEX_PROJECT_ID}-tfstate' && \
terraform destroy -var 'project_id=${VERTEX_PROJECT_ID}' -var 'region=${VERTEX_LOCATION}'
terraform destroy -var 'project_id=${VERTEX_PROJECT_ID}' -var 'region=${VERTEX_LOCATION}' $$AUTO_APPROVE_FLAG

install: ## Set up local environment for Python development on pipelines
@cd pipelines && \
poetry install --with dev && \
cd .. && \
for component_group in components/*/ ; do \
echo "Setup for $$component_group" && \
cd "$$component_group" && \
poetry install --with dev && \
cd ../.. ;\
done ; \

cd ../components && \
poetry install --with dev

compile: ## Compile the pipeline to pipeline.yaml. Must specify pipeline=<training|prediction>
@cd pipelines/src && \
@@ -82,20 +83,17 @@ run: ## Run pipeline in sandbox environment. Must specify pipeline=<training|pre
cd pipelines/src && \
poetry run python -m pipelines.utils.trigger_pipeline --template_path=pipelines/${pipeline}/pipeline.yaml --display_name=${pipeline} --wait=${wait}

test: ## Run unit tests for a specific component group or for all component groups and the pipeline trigger code. Optionally specify GROUP=<component group e.g. vertex-components>
@if [ -n "${GROUP}" ]; then \
echo "Test components under components/${GROUP}" && \
cd components/${GROUP} && \
poetry run pytest ; \
else \
echo "Testing scripts" && \
cd pipelines && \
poetry run python -m pytest tests/utils &&\
cd .. && \
for i in components/*/ ; do \
echo "Test components under $$i" && \
cd "$$i" && \
poetry run pytest && \
cd ../.. ;\
done ; \
fi
components ?= true
test: ## Run unit tests. Specify components=<true|false> to test scripts and optionally components
@if [ $(components) = "true" ]; then \
echo "Testing components" && \
cd components && \
poetry run pytest && \
cd .. ; \
elif [ $(components) != "false" ]; then \
echo "ValueError: components must be either true or false" ; \
exit ; \
fi && \
echo "Testing scripts" && \
cd pipelines && \
poetry run python -m pytest tests/utils
230 changes: 65 additions & 165 deletions README.md
@@ -21,13 +21,17 @@ _AKA "Vertex AI Turbo Templates"_
## Introduction

This repository provides a reference implementation of [Vertex Pipelines](https://cloud.google.com/vertex-ai/docs/pipelines/) for creating a production-ready MLOps solution on Google Cloud.
You can take this repository as a starting point for your own ML use cases. The implementation includes:
You can take this repository as a starting point for your own ML use cases.
The implementation includes:

* Infrastructure-as-Code using Terraform for a typical dev/test/prod setup of Vertex AI and other relevant services
* ML training and batch prediction pipelines using the Kubeflow Pipelines SDK for an example use case (using the [Chicago Taxi Trips Dataset](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=chicago_taxi_trips&page=dataset))
* Reusable KFP components that can be used in ML pipelines
* CI/CD using Google Cloud Build for linting, testing, and deploying ML pipelines
* Developer scripts (Makefile, Python scripts etc)
* **Infrastructure-as-Code** using Terraform for a typical dev/test/prod setup of Vertex AI and other relevant services
* **ML training and prediction pipelines** using the Kubeflow Pipelines SDK
* **Reusable Kubeflow components** that can be used in common ML pipelines
* **CI/CD** using Google Cloud Build for linting, testing, and deploying ML pipelines
* **Developer scripts** (Makefile, Python scripts etc.)

**Get started today by following [this step-by-step notebook tutorial](docs/notebooks)! 🚀**
In this three-part notebook series you'll deploy a Google Cloud project and run production-ready ML pipelines using Vertex AI without writing a single line of code.

## Cloud Architecture

@@ -42,220 +46,116 @@ There are four different Google Cloud projects in use
* `prod` - production environment
* `admin` - separate Google Cloud project for setting up CI/CD in Cloud Build (since the CI/CD pipelines operate across the different environments)

Vertex Pipelines are scheduled using Google Cloud Scheduler. Cloud Scheduler emits a Pub/Sub message that triggers a Cloud Function, which in turn triggers the Vertex Pipeline to run. _In future, this will be replaced with the Vertex Pipelines Scheduler (once there is a Terraform resource for it)._

## Infrastructure

The cloud infrastructure is managed using Terraform and is defined in the [`terraform`](terraform) directory. There are three Terraform modules defined in [`terraform/modules`](terraform/modules):

- `cloudfunction` - deploys a (Pub/Sub-triggered) Cloud Function from local source code
- `scheduled_pipelines` - deploys Cloud Scheduler jobs that will trigger Vertex Pipeline runs (via the above Cloud Function)
- `vertex_deployment` - deploys Cloud infrastructure required for running Vertex Pipelines, including enabling APIs, creating buckets, Artifact Registry repos, service accounts, and IAM permissions.
Vertex Pipelines are scheduled using Google Cloud Scheduler.
Cloud Scheduler emits a Pub/Sub message that triggers a Cloud Function, which in turn triggers the Vertex Pipeline to run.
_In future, this will be replaced with the Vertex Pipelines Scheduler (once there is a Terraform resource for it)._
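As a rough sketch of that chain, a Cloud Scheduler job publishing to a Pub/Sub topic could be created like this (illustrative only; in this repository the jobs are created by Terraform, and the topic name, schedule and message body below are hypothetical placeholders):

```bash
# Illustrative sketch - the real scheduler jobs are managed by Terraform.
gcloud scheduler jobs create pubsub training-pipeline-trigger \
  --project=${VERTEX_PROJECT_ID} \
  --location=${VERTEX_LOCATION} \
  --schedule="0 0 * * 0" \
  --topic=vertex-pipelines-trigger \
  --message-body='{"display_name": "training"}'
```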

There is a Terraform configuration for each environment (dev/test/prod) under [`terraform/envs`](terraform/envs/).
## Setup

How to deploy this infrastructure is covered in a [later section](#deploying-infrastructure).

## Developer setup

### Prerequisites
**Prerequisites:**

- [Terraform](https://www.terraform.io/) for managing cloud infrastructure
- [tfswitch](https://tfswitch.warrensbox.com/) to automatically choose and download an appropriate Terraform version (recommended)
- [Pyenv](https://github.com/pyenv/pyenv#installation) for managing Python versions
- [Poetry](https://python-poetry.org/) for managing Python dependencies
- [Google Cloud SDK (gcloud)](https://cloud.google.com/sdk/docs/quickstart)
- Make
- Cloned repo

### Local setup

1. Clone the repository locally (or create a new repo from this template)
1. Install the correct Python version: `pyenv install`
1. Install poetry - follow the instructions in the [poetry documentation](https://python-poetry.org/docs/#installation)
1. Configure poetry to use the Python version from pyenv: `poetry config virtualenvs.prefer-active-python true`
1. Install poetry dependencies for ML pipelines: `make install`
1. Install pre-commit hooks: `cd pipelines && poetry run pre-commit install`
1. Copy `env.sh.example` to `env.sh`, and update the environment variables in `env.sh` for your dev environment (particularly `VERTEX_PROJECT_ID`, `VERTEX_LOCATION` and `RESOURCE_SUFFIX`)
1. Authenticate to Google Cloud
1. `gcloud auth login`
1. `gcloud auth application-default login`

### Deploying infrastructure

You will need four Google Cloud projects:

- dev
- test
- prod
- admin
**Deploy infrastructure:**

You will need four Google Cloud projects: dev, test, prod, and admin.
The Cloud Build pipelines will run in the _admin_ project, and deploy resources into the dev/test/prod projects.

Before your CI/CD pipelines can deploy the infrastructure, you will need to set up a Terraform state bucket for each environment:

```bash
gsutil mb -l <GCP region e.g. europe-west2> -p <DEV PROJECT ID> --pap=enforced gs://<DEV PROJECT ID>-tfstate && gsutil ubla set on gs://<DEV PROJECT ID>-tfstate

gsutil mb -l <GCP region e.g. europe-west2> -p <TEST PROJECT ID> --pap=enforced gs://<TEST PROJECT ID>-tfstate && gsutil ubla set on gs://<TEST PROJECT ID>-tfstate

gsutil mb -l <GCP region e.g. europe-west2> -p <PROD PROJECT ID> --pap=enforced gs://<PROD PROJECT ID>-tfstate && gsutil ubla set on gs://<PROD PROJECT ID>-tfstate
export DEV_PROJECT_ID=my-dev-gcp-project
export DEV_LOCATION=europe-west2
gsutil mb -l ${DEV_LOCATION} -p ${DEV_PROJECT_ID} --pap=enforced gs://${DEV_PROJECT_ID}-tfstate && \
gsutil ubla set on gs://${DEV_PROJECT_ID}-tfstate
```
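The same pattern applies to the test and prod projects, as in the earlier version of these instructions; for example (the project IDs below are placeholders):

```bash
# Repeat for the remaining environments, substituting your own project IDs
export TEST_PROJECT_ID=my-test-gcp-project
export PROD_PROJECT_ID=my-prod-gcp-project
gsutil mb -l ${DEV_LOCATION} -p ${TEST_PROJECT_ID} --pap=enforced gs://${TEST_PROJECT_ID}-tfstate && \
gsutil ubla set on gs://${TEST_PROJECT_ID}-tfstate
gsutil mb -l ${DEV_LOCATION} -p ${PROD_PROJECT_ID} --pap=enforced gs://${PROD_PROJECT_ID}-tfstate && \
gsutil ubla set on gs://${PROD_PROJECT_ID}-tfstate
```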

You will also need to manually enable the Cloud Resource Manager and Service Usage APIs for your _admin_ project:
Enable APIs in admin project:

```bash
gcloud services enable cloudresourcemanager.googleapis.com --project=<ADMIN PROJECT ID>
gcloud services enable serviceusage.googleapis.com --project=<ADMIN PROJECT ID>
export ADMIN_PROJECT_ID=my-admin-gcp-project
gcloud services enable cloudresourcemanager.googleapis.com serviceusage.googleapis.com --project=${ADMIN_PROJECT_ID}
```
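To double-check that both APIs are active before continuing, an optional verification might look like:

```bash
# Optional sanity check - list enabled services and filter for the two APIs
gcloud services list --enabled --project=${ADMIN_PROJECT_ID} \
  | grep -E "cloudresourcemanager|serviceusage"
```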

Install Terraform on your local machine. We recommend using [`tfswitch`](https://tfswitch.warrensbox.com/) to automatically choose and download an appropriate version for you (run `tfswitch` from the [`terraform/envs/dev`](terraform/envs/dev/) directory).

Now you can deploy the infrastructure using Terraform:

```bash
make deploy env=dev VERTEX_PROJECT_ID=<DEV PROJECT ID>
make deploy env=test VERTEX_PROJECT_ID=<TEST PROJECT ID>
make deploy env=prod VERTEX_PROJECT_ID=<PROD PROJECT ID>
```
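Judging by the Makefile change above, the deploy target also accepts an `auto-approve` variable that passes `-auto-approve` to Terraform; a sketch of non-interactive usage (useful in automation) would be:

```bash
# Assumes auto-approve=true maps to terraform's -auto-approve flag, as in the Makefile diff above
make deploy env=dev VERTEX_PROJECT_ID=<DEV PROJECT ID> auto-approve=true
```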

#### Optional - Tearing down infrastructure
More details about the infrastructure are explained in [this README](docs/INFRASTRUCTURE.md).
It describes the scheduling of pipelines and how to tear down infrastructure.

To tear down the infrastructure you have created with Terraform, run these commands:
**Install dependencies:**

```bash
make undeploy env=dev VERTEX_PROJECT_ID=<DEV PROJECT ID>
make undeploy env=test VERTEX_PROJECT_ID=<TEST PROJECT ID>
make undeploy env=prod VERTEX_PROJECT_ID=<PROD PROJECT ID>
pyenv install --skip-existing # install Python
poetry config virtualenvs.prefer-active-python true # configure Poetry
make install # install Python dependencies
cd pipelines && poetry run pre-commit install # install pre-commit hooks
cp env.sh.example env.sh
```

### Example ML pipelines
Update the environment variables for your dev environment in `env.sh`.
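For reference, the variables called out in the setup notes (`VERTEX_PROJECT_ID`, `VERTEX_LOCATION`, `RESOURCE_SUFFIX`) would be set in `env.sh` roughly as follows; the values here are placeholders, and `env.sh.example` remains the authoritative template:

```bash
# env.sh - illustrative values only; see env.sh.example for the full list of variables
export VERTEX_PROJECT_ID=my-dev-gcp-project
export VERTEX_LOCATION=europe-west2
export RESOURCE_SUFFIX=your-initials
```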

This repository contains example ML training and prediction pipelines for scikit-learn/XGBoost using the popular [Chicago Taxi Dataset](https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips). The details of these can be found in the [separate README](pipelines/README.md).
**Authenticate to Google Cloud:**

#### Pre-requisites

Before you can run these example pipelines successfully, there are a few additional things you will need to deploy into each environment (they have not been included in the Terraform code as they are specific to these Chicago Taxi pipelines).

1. Create a new BigQuery dataset for the Chicago Taxi data:

```
bq --location=${VERTEX_LOCATION} mk --dataset "${VERTEX_PROJECT_ID}:chicago_taxi_trips"
```

2. Create a new BigQuery dataset for data processing during the pipelines:

```
bq --location=${VERTEX_LOCATION} mk --dataset "${VERTEX_PROJECT_ID}:preprocessing"
```

3. Set up a BigQuery transfer job to mirror the Chicago Taxi dataset to your project

```
bq mk --transfer_config \
--project_id=${VERTEX_PROJECT_ID} \
--data_source="cross_region_copy" \
--target_dataset="chicago_taxi_trips" \
--display_name="Chicago taxi trip mirror" \
--params='{"source_dataset_id":"'"chicago_taxi_trips"'","source_project_id":"'"bigquery-public-data"'"}'
```bash
gcloud auth login
gcloud auth application-default login
```

### Building the container images
## Run

The [model/](/model/) directory contains the code for custom training and serving container images, including the model training script at [model/training/train.py](model/training/train.py). You can modify this to suit your own use case.
This repository contains example ML training and prediction pipelines which are explained in [this README](docs/PIPELINES.md).

**Build containers:** The [model/](/model/) directory contains the code for custom training and serving container images, including the model training script at [model/training/train.py](model/training/train.py).
You can modify this to suit your own use case.
Build the training and serving container images and push them to Artifact Registry with:

```bash
make build
make build [ targets=training serving ]
```

Optionally specify the `target` variable to only build one of the images. For example, to build only the serving image:
Optionally specify the `targets` variable to only build one of the images.

```bash
make build target=serving
```
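With the variable renamed from `target` to `targets` in this change, building only the serving image would presumably become:

```bash
# Assumes targets accepts a single value as well as a space-separated list
make build targets=serving
```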
**Execute pipelines:** Vertex AI Pipelines uses Kubeflow to orchestrate your training steps. As such, you'll need to:

### Running Pipelines
1. Compile the pipeline
1. Build dependent Docker containers
1. Run the pipeline in Vertex AI

You can run the training pipeline (for example) with:
Execute the following command to run through steps 1-3:

```bash
make run pipeline=training [ wait=<true|false> ]
make run pipeline=training [ wait=<true|false> ] [ build=<true|false> ] [ compile=<true|false> ]
```

This will execute the pipeline using the chosen template on Vertex AI, namely it will:
The command has the following true/false flags:

1. Compile the pipeline using the Kubeflow Pipelines SDK
1. Trigger the pipeline with the help of `pipelines/trigger/main.py`
1. (optional) Wait for pipeline to finish before returning if `wait` is set to `true` (default is false)
- `build` - rebuild the training and serving containers (limit the build by setting `targets=training` to build only one of them)
- `compile` - recompile the pipeline to YAML
- `wait` - wait for the pipeline run to finish before returning (synchronous) instead of returning immediately (asynchronous)
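Putting the flags together, a fully explicit run of the training pipeline might look like the following (flag behaviour as described above; the defaults may differ):

```bash
# Rebuild the containers, recompile the pipeline, run it and wait for completion
make run pipeline=training build=true compile=true wait=true
```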

#### Pipeline input parameters

The ML pipelines have input parameters. As you can see in the pipeline definition files (`pipelines/src/pipelines/<training|prediction>/pipeline.py`), they have default values, and some of these default values are derived from environment variables (which in turn are defined in `env.sh`).

When triggering ad hoc runs in your dev/sandbox environment, or when running the E2E tests in CI, these default values are used. For the test and production deployments, the pipeline parameters are defined in the Terraform code for the Cloud Scheduler jobs (`terraform/envs/<dev|test|prod>/scheduled_jobs.auto.tfvars`) - see the section on [Scheduling pipelines](#scheduling-pipelines).

## Testing
## Test

Unit tests are performed using [pytest](https://docs.pytest.org).
The unit tests for custom KFP components are run on each pull request. To run them on your local machine:

```
make test
```
The unit tests are run on each pull request.
To run them locally, you can execute the following command and optionally enable or disable testing of components:

Alternatively, only test one of the component groups by running:
```
make test GROUP=vertex-components
make test [ components=<true|false> ]
```
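For example, to test only the pipeline utility scripts and skip the component tests (assuming the `components` flag behaves as in the Makefile change above):

```bash
make test components=false
```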

There are also unit tests for the utility scripts in [pipelines/src/pipelines/utils](/pipelines/src/pipelines/utils/). To run them on your local machine:
## Automation

```
make test
```

## Customize pipelines

### Adding a new pipeline

This repository contains a training and a (batch) prediction pipeline. To add another ML pipeline (e.g. for continuous evaluation), create a new directory under the `pipelines/src/pipelines` directory. Within your new pipeline folder, create a `pipeline.py` file - this is where you should provide your pipeline definition using the KFP DSL (in a function named `pipeline`).

Alternatively, you can just copy and paste the `training` or `prediction` directory.

See below for an example folder structure:

```
vertex-pipelines-end-to-end-samples
|
├── pipelines
│ ├── src
│ │ ├── pipelines
│ │ │ ├── new_pipeline
│ │ │ │ ├── pipeline.py
│ │ │ │ └── queries
│ │ │ │ └── my_query.sql
```

Make sure that you give the ML pipeline a unique name in the `@pipeline` decorator.

To run your pipeline, use `make run` as before (optionally adding a parameter to wait until the pipeline is finished before returning - defaults to false):

```bash
make run pipeline=your_new_pipeline [ wait=<true|false> ]
```

Some of the scripts e.g. CI/CD pipelines assume only a training and prediction pipeline. You will need to adapt these to add in the compile, run and upload steps for your new pipeline in [cloudbuild/pr-checks.yaml](/cloudbuild/pr-checks.yaml), [cloudbuild/e2e-test.yaml](/cloudbuild/e2e-test.yaml) and [cloudbuild/release.yaml](/cloudbuild/release.yaml).

### Scheduling pipelines

Terraform is used to deploy Cloud Scheduler jobs that trigger the Vertex Pipeline runs. This is done by the CI/CD pipelines (see section below on CI/CD).

To schedule pipelines into an environment, you will need to provide the `cloud_schedulers_config` variable to the Terraform configuration for the relevant environment. You can find an example of this configuration in [`terraform/modules/scheduled_pipelines/scheduled_jobs.auto.tfvars.example`](terraform/modules/scheduled_pipelines/scheduled_jobs.auto.tfvars.example). Copy this example file into the relevant directory for your environment (e.g. `terraform/envs/dev` for the dev environment) and remove the `.example` suffix. Adjust the configuration file as appropriate.

## CI/CD

For details on setting up CI/CD, see the [CI/CD README](/cloudbuild/README.md).

For details on setting up CI/CD for the template codebase itself (instead of for your own ML use case), follow the guide [here](/docs/TESTING_SETUP.md).
For details on setting up CI/CD, see [this README](/docs/AUTOMATION.md).

## Putting it all together

For a full walkthrough of the journey from changing the ML pipeline code to having it scheduled and running in production, please see the guide [here](docs/PRODUCTION.md).

We value your contribution; see [this guide](docs/CONTRIBUTION.md) for contributing to this project.
1 change: 0 additions & 1 deletion cloudbuild/pr-checks.yaml
@@ -27,7 +27,6 @@ steps:
make install && \
git init && \
git add . && \
make pre-commit && \
make compile pipeline=training && \
make compile pipeline=prediction && \
make test
File renamed without changes.
