From 034c440e3048e81db3d321e6046c4b94ec93657d Mon Sep 17 00:00:00 2001
From: Felix Schaumann
Date: Thu, 11 May 2023 08:30:16 +0200
Subject: [PATCH] Fix missing model monitoring in XGB train script

---
 .../tensorflow/prediction/pipeline.py         | 16 +++++----
 .../training/assets/train_tf_model.py         | 13 ++++---
 .../pipelines/tensorflow/training/pipeline.py |  1 +
 .../pipelines/xgboost/prediction/pipeline.py  | 16 +++++----
 .../training/assets/train_xgb_model.py        | 35 ++++++++++++++-----
 .../pipelines/xgboost/training/pipeline.py    |  1 +
 6 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/pipelines/src/pipelines/tensorflow/prediction/pipeline.py b/pipelines/src/pipelines/tensorflow/prediction/pipeline.py
index 5121aea3..b011b0c1 100644
--- a/pipelines/src/pipelines/tensorflow/prediction/pipeline.py
+++ b/pipelines/src/pipelines/tensorflow/prediction/pipeline.py
@@ -108,12 +108,16 @@ def tensorflow_pipeline(
     ).set_display_name("Ingest data")
 
     # lookup champion model
-    champion_model = lookup_model(
-        model_name=model_name,
-        project_location=project_location,
-        project_id=project_id,
-        fail_on_model_not_found=True,
-    ).set_display_name("Look up champion model")
+    champion_model = (
+        lookup_model(
+            model_name=model_name,
+            project_location=project_location,
+            project_id=project_id,
+            fail_on_model_not_found=True,
+        )
+        .set_display_name("Look up champion model")
+        .set_caching_options(False)
+    )
 
     # batch predict from BigQuery to BigQuery
     bigquery_source_input_uri = f"bq://{project_id}.{dataset_id}.{ingested_table}"
diff --git a/pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py b/pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py
index c89a7880..5d164ae7 100644
--- a/pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py
+++ b/pipelines/src/pipelines/tensorflow/training/assets/train_tf_model.py
@@ -219,6 +219,9 @@ def _get_temp_dir(dirpath, task_id):
 parser.add_argument("--hparams", default={}, type=json.loads)
 args = parser.parse_args()
 
+if args.model.startswith("gs://"):
+    args.model = Path("/gcs/" + args.model[5:])
+
 # merge dictionaries by overwriting default_model_params if provided in model_params
 hparams = {**DEFAULT_HPARAMS, **args.hparams}
 logging.info(f"Using model hyper-parameters: {hparams}")
@@ -261,9 +264,9 @@ def _get_temp_dir(dirpath, task_id):
     logging.info("not chief node, exiting now")
     sys.exit()
 
-os.makedirs(args.model, exist_ok=True)
 logging.info(f"Save model to: {args.model}")
-tf_model.save(args.model, save_format="tf")
+args.model.mkdir(parents=True)
+tf_model.save(str(args.model), save_format="tf")
 
 logging.info(f"Save metrics to: {args.metrics}")
 eval_metrics = dict(zip(tf_model.metrics_names, tf_model.evaluate(test_ds)))
@@ -281,11 +284,13 @@ def _get_temp_dir(dirpath, task_id):
     json.dump(metrics, fp)
 
 # Persist URIs of training file(s) for model monitoring in batch predictions
-path = Path(args.model) / TRAINING_DATASET_INFO
+# See https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1beta1.types.ModelMonitoringObjectiveConfig.TrainingDataset # noqa: E501
+# for the expected schema.
+path = args.model / TRAINING_DATASET_INFO
 training_dataset_for_monitoring = {
     "gcsSource": {"uris": [args.train_data]},
     "dataFormat": "csv",
-    "targetField": hparams["label"],
+    "targetField": label,
 }
 logging.info(f"Save training dataset info for model monitoring: {path}")
 logging.info(f"Training dataset: {training_dataset_for_monitoring}")
diff --git a/pipelines/src/pipelines/tensorflow/training/pipeline.py b/pipelines/src/pipelines/tensorflow/training/pipeline.py
index cb99e091..2767d210 100644
--- a/pipelines/src/pipelines/tensorflow/training/pipeline.py
+++ b/pipelines/src/pipelines/tensorflow/training/pipeline.py
@@ -215,6 +215,7 @@ def tensorflow_pipeline(
             fail_on_model_not_found=False,
         )
         .set_display_name("Lookup past model")
+        .set_caching_options(False)
         .outputs["model_resource_name"]
     )
 
diff --git a/pipelines/src/pipelines/xgboost/prediction/pipeline.py b/pipelines/src/pipelines/xgboost/prediction/pipeline.py
index 0bd1ec91..ed474eb8 100644
--- a/pipelines/src/pipelines/xgboost/prediction/pipeline.py
+++ b/pipelines/src/pipelines/xgboost/prediction/pipeline.py
@@ -102,12 +102,16 @@ def xgboost_pipeline(
     ).set_display_name("Ingest data")
 
     # lookup champion model
-    champion_model = lookup_model(
-        model_name=model_name,
-        project_location=project_location,
-        project_id=project_id,
-        fail_on_model_not_found=True,
-    ).set_display_name("Look up champion model")
+    champion_model = (
+        lookup_model(
+            model_name=model_name,
+            project_location=project_location,
+            project_id=project_id,
+            fail_on_model_not_found=True,
+        )
+        .set_display_name("Look up champion model")
+        .set_caching_options(False)
+    )
 
     # batch predict from BigQuery to BigQuery
     bigquery_source_input_uri = f"bq://{project_id}.{dataset_id}.{ingested_table}"
diff --git a/pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py b/pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py
index 31d95247..71cc65b5 100644
--- a/pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py
+++ b/pipelines/src/pipelines/xgboost/training/assets/train_xgb_model.py
@@ -1,4 +1,6 @@
 import argparse
+from pathlib import Path
+
 import joblib
 import json
 import os
@@ -14,7 +16,9 @@
 
 logging.basicConfig(level=logging.DEBUG)
 
-
+# used for monitoring during prediction time
+TRAINING_DATASET_INFO = "training_dataset.json"
+# numeric/categorical features in Chicago trips dataset to be preprocessed
 NUM_COLS = ["dayofweek", "hourofday", "trip_distance", "trip_miles", "trip_seconds"]
 ORD_COLS = ["company"]
 OHE_COLS = ["payment_type"]
@@ -39,6 +43,9 @@ def indices_in_list(elements: list, base_list: list) -> list:
 parser.add_argument("--hparams", default={}, type=json.loads)
 args = parser.parse_args()
 
+if args.model.startswith("gs://"):
+    args.model = Path("/gcs/" + args.model[5:])
+
 logging.info("Read csv files into dataframes")
 df_train = pd.read_csv(args.train_data)
 df_valid = pd.read_csv(args.valid_data)
@@ -111,15 +118,25 @@ def indices_in_list(elements: list, base_list: list) -> list:
     "rootMeanSquaredLogError": np.sqrt(metrics.mean_squared_log_error(y_test, y_pred)),
 }
 
-try:
-    model_path = args.model.replace("gs://", "/gcs/")
-    logging.info(f"Save model to: {model_path}")
-    os.makedirs(model_path, exist_ok=True)
-    joblib.dump(pipeline, model_path + "model.joblib")
-except Exception as e:
-    print(e)
-    raise e
+logging.info(f"Save model to: {args.model}")
+args.model.mkdir(parents=True)
+joblib.dump(pipeline, str(args.model / "model.joblib"))
 
 logging.info(f"Metrics: {metrics}")
 with open(args.metrics, "w") as fp:
     json.dump(metrics, fp)
+
+# Persist URIs of training file(s) for model monitoring in batch predictions
+# See https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform_v1beta1.types.ModelMonitoringObjectiveConfig.TrainingDataset # noqa: E501
+# for the expected schema.
+path = args.model / TRAINING_DATASET_INFO
+training_dataset_for_monitoring = {
+    "gcsSource": {"uris": [args.train_data]},
+    "dataFormat": "csv",
+    "targetField": label,
+}
+logging.info(f"Training dataset info: {training_dataset_for_monitoring}")
+
+with open(path, "w") as fp:
+    logging.info(f"Save training dataset info for model monitoring: {path}")
+    json.dump(training_dataset_for_monitoring, fp)
diff --git a/pipelines/src/pipelines/xgboost/training/pipeline.py b/pipelines/src/pipelines/xgboost/training/pipeline.py
index 1db2023c..27cb47ed 100644
--- a/pipelines/src/pipelines/xgboost/training/pipeline.py
+++ b/pipelines/src/pipelines/xgboost/training/pipeline.py
@@ -212,6 +212,7 @@ def xgboost_pipeline(
             fail_on_model_not_found=False,
         )
         .set_display_name("Lookup past model")
+        .set_caching_options(False)
         .outputs["model_resource_name"]
     )
 
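-- 
Usage note: the training_dataset.json written by both train scripts above is
read back at batch prediction time to configure model monitoring. Below is a
minimal sketch of such a reader, assuming only the file name and camelCase
keys from this patch; the helper name and its model_dir argument are
hypothetical, not an API from this repo.

import json
from pathlib import Path

# File name matches TRAINING_DATASET_INFO in the train scripts above.
TRAINING_DATASET_INFO = "training_dataset.json"


def load_training_dataset_info(model_dir: str) -> dict:
    """Load the monitoring metadata written at training time.

    Hypothetical helper: model_dir is the directory the train script
    saved the model into (e.g. a /gcs/... fuse path).
    """
    path = Path(model_dir) / TRAINING_DATASET_INFO
    with open(path) as fp:
        info = json.load(fp)
    # Keys follow the camelCase JSON form of
    # ModelMonitoringObjectiveConfig.TrainingDataset referenced in the patch.
    for key in ("gcsSource", "dataFormat", "targetField"):
        if key not in info:
            raise ValueError(f"{path} is missing required key: {key}")
    return info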