From e97adfb1984592a6f9e4eea8f8bdc3d2969e3d2d Mon Sep 17 00:00:00 2001
From: Shane Glass <23001651+shanecglass@users.noreply.github.com>
Date: Mon, 9 Oct 2023 15:27:46 -0400
Subject: [PATCH] feat!: data_warehosue migrating to TheLook Ecommerce dataset
(#257)
---
modules/data_warehouse/README.md | 4 +-
.../assets/data-warehouse-architecture.svg | 4 +-
modules/data_warehouse/bigquery.tf | 157 +++++++---
modules/data_warehouse/main.tf | 33 ++-
modules/data_warehouse/outputs.tf | 7 +-
.../schema/distribution_centers_schema.json | 22 ++
.../src/schema/events_schema.json | 67 +++++
.../src/schema/inventory_items_schema.json | 62 ++++
.../src/schema/order_items_schema.json | 57 ++++
.../src/schema/orders_schema.json | 47 +++
.../src/schema/products_schema.json | 47 +++
.../src/schema/users_schema.json | 77 +++++
.../src/sql/sp_bigqueryml_model.sql | 75 +++--
.../src/sql/sp_lookerstudio_report.sql | 274 +++++++++++++-----
.../src/sql/sp_provision_lookup_tables.sql | 48 +--
.../src/sql/sp_sample_queries.sql | 250 +++++++++-------
.../src/sql/sp_sample_translation_queries.sql | 61 +---
.../data_warehouse/src/taxi_trips_schema.json | 116 --------
.../data_warehouse/templates/workflow.tftpl | 7 +-
modules/data_warehouse/workflows.tf | 11 +-
20 files changed, 976 insertions(+), 450 deletions(-)
create mode 100644 modules/data_warehouse/src/schema/distribution_centers_schema.json
create mode 100644 modules/data_warehouse/src/schema/events_schema.json
create mode 100644 modules/data_warehouse/src/schema/inventory_items_schema.json
create mode 100644 modules/data_warehouse/src/schema/order_items_schema.json
create mode 100644 modules/data_warehouse/src/schema/orders_schema.json
create mode 100644 modules/data_warehouse/src/schema/products_schema.json
create mode 100644 modules/data_warehouse/src/schema/users_schema.json
delete mode 100644 modules/data_warehouse/src/taxi_trips_schema.json
diff --git a/modules/data_warehouse/README.md b/modules/data_warehouse/README.md
index c697bd28..ab535532 100644
--- a/modules/data_warehouse/README.md
+++ b/modules/data_warehouse/README.md
@@ -12,7 +12,7 @@ The resources/services/activations/deletions that this module will create/trigge
- Creates a BigQuery Dataset
- Creates a BigQuery Table
- Creates a Google Cloud Storage bucket
-- Loads the Google Cloud Storage bucket with data from https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips
+- Loads the Google Cloud Storage bucket with data from [TheLook eCommerce Public Dataset](https://console.cloud.google.com/marketplace/product/bigquery-public-data/thelook-ecommerce)
- Provides SQL examples
- Creates and inferences with a BigQuery ML model
- Creates a Looker Studio report
@@ -47,7 +47,7 @@ Functional examples are included in the
|------|-------------|
| bigquery\_editor\_url | The URL to launch the BigQuery editor with the sample query procedure opened |
| ds\_friendly\_name | Dataset name |
-| lookerstudio\_report\_url | The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis |
+| lookerstudio\_report\_url | The URL to create a new Looker Studio report displays a sample dashboard for the e-commerce data analysis |
| neos\_tutorial\_url | The URL to launch the in-console tutorial for the EDW solution |
| raw\_bucket | Raw bucket name |
diff --git a/modules/data_warehouse/assets/data-warehouse-architecture.svg b/modules/data_warehouse/assets/data-warehouse-architecture.svg
index 8fb9c7cc..e2c804cb 100644
--- a/modules/data_warehouse/assets/data-warehouse-architecture.svg
+++ b/modules/data_warehouse/assets/data-warehouse-architecture.svg
@@ -1,6 +1,6 @@
\ No newline at end of file
+ Google Cloud ProjectAPPLICATIONDATA SOURCES1DATA LAKE2435DATA WAREHOUSEDataDataDataVISUALIZATIONDATA ANALYSISLooker StudioBigQueryCloud StorageWorkflows
diff --git a/modules/data_warehouse/bigquery.tf b/modules/data_warehouse/bigquery.tf
index 33175b87..5c5dc324 100644
--- a/modules/data_warehouse/bigquery.tf
+++ b/modules/data_warehouse/bigquery.tf
@@ -18,12 +18,14 @@
# # Create the BigQuery dataset
resource "google_bigquery_dataset" "ds_edw" {
project = module.project-services.project_id
- dataset_id = "ds_edw"
+ dataset_id = "thelook"
friendly_name = "My EDW Dataset"
description = "My EDW Dataset with tables"
location = var.region
labels = var.labels
delete_contents_on_destroy = var.force_destroy
+
+ depends_on = [time_sleep.wait_after_apis]
}
# # Create a BigQuery connection
@@ -33,6 +35,7 @@ resource "google_bigquery_connection" "ds_connection" {
location = var.region
friendly_name = "Storage Bucket Connection"
cloud_resource {}
+ depends_on = [time_sleep.wait_after_apis]
}
# # Grant IAM access to the BigQuery Connection account for Cloud Storage
@@ -42,64 +45,146 @@ resource "google_storage_bucket_iam_binding" "bq_connection_iam_object_viewer" {
members = [
"serviceAccount:${google_bigquery_connection.ds_connection.cloud_resource[0].service_account_id}",
]
+}
- depends_on = [
- google_bigquery_connection.ds_connection,
- ]
+# # Create a Biglake table for events with metadata caching
+resource "google_bigquery_table" "tbl_edw_events" {
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
+ table_id = "events"
+ project = module.project-services.project_id
+ deletion_protection = var.deletion_protection
+
+ schema = file("${path.module}/src/schema/events_schema.json")
+
+ external_data_configuration {
+ autodetect = true
+ connection_id = google_bigquery_connection.ds_connection.name
+ source_format = "PARQUET"
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/events.parquet"]
+ }
+
+ labels = var.labels
}
-# # Create a BigQuery external table
-resource "google_bigquery_table" "tbl_edw_taxi" {
+# # Create a Biglake table for inventory_items
+resource "google_bigquery_table" "tbl_edw_inventory_items" {
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
- table_id = "taxi_trips"
+ table_id = "inventory_items"
project = module.project-services.project_id
deletion_protection = var.deletion_protection
+ schema = file("${path.module}/src/schema/inventory_items_schema.json")
+
external_data_configuration {
autodetect = true
- connection_id = "${module.project-services.project_id}.${var.region}.ds_connection"
+ connection_id = google_bigquery_connection.ds_connection.name
source_format = "PARQUET"
- source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/new-york-taxi-trips/tlc-yellow-trips-2022/taxi-*.Parquet"]
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/inventory_items.parquet"]
+ }
+
+ labels = var.labels
+}
+
+# # Create a Biglake table with metadata caching for order_items
+resource "google_bigquery_table" "tbl_edw_order_items" {
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
+ table_id = "order_items"
+ project = module.project-services.project_id
+ deletion_protection = var.deletion_protection
+ schema = file("${path.module}/src/schema/order_items_schema.json")
+
+ external_data_configuration {
+ autodetect = true
+ connection_id = google_bigquery_connection.ds_connection.name
+ source_format = "PARQUET"
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/order_items.parquet"]
}
- schema = file("${path.module}/src/taxi_trips_schema.json")
+ labels = var.labels
+}
+
+# # Create a Biglake table for orders
+resource "google_bigquery_table" "tbl_edw_orders" {
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
+ table_id = "orders"
+ project = module.project-services.project_id
+ deletion_protection = var.deletion_protection
+
+ schema = file("${path.module}/src/schema/orders_schema.json")
+
+ external_data_configuration {
+ autodetect = true
+ connection_id = google_bigquery_connection.ds_connection.name
+ source_format = "PARQUET"
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/orders.parquet"]
+ }
labels = var.labels
+}
- depends_on = [
- google_bigquery_connection.ds_connection,
- google_storage_bucket.raw_bucket,
- ]
+# # Create a Biglake table for products
+resource "google_bigquery_table" "tbl_edw_products" {
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
+ table_id = "products"
+ project = module.project-services.project_id
+ deletion_protection = var.deletion_protection
+
+ schema = file("${path.module}/src/schema/products_schema.json")
+
+ external_data_configuration {
+ autodetect = true
+ connection_id = google_bigquery_connection.ds_connection.name
+ source_format = "PARQUET"
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/products.parquet"]
+ }
+
+ labels = var.labels
+}
+
+# # Create a Biglake table for products
+resource "google_bigquery_table" "tbl_edw_users" {
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
+ table_id = "users"
+ project = module.project-services.project_id
+ deletion_protection = var.deletion_protection
+
+ schema = file("${path.module}/src/schema/users_schema.json")
+
+ external_data_configuration {
+ autodetect = true
+ connection_id = google_bigquery_connection.ds_connection.name
+ source_format = "PARQUET"
+ source_uris = ["gs://${google_storage_bucket.raw_bucket.name}/thelook-ecommerce/users.parquet"]
+ }
+
+ labels = var.labels
}
# Load Queries for Stored Procedure Execution
-# # Load Lookup Data Tables
+# # Load Distribution Center Lookup Data Tables
resource "google_bigquery_routine" "sp_provision_lookup_tables" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_provision_lookup_tables"
routine_type = "PROCEDURE"
language = "SQL"
- definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id })
-
- depends_on = [
- google_bigquery_dataset.ds_edw,
- ]
+ definition_body = templatefile("${path.module}/src/sql/sp_provision_lookup_tables.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
}
-
-# # Add Looker Studio Data Report Procedure
-resource "google_bigquery_routine" "sproc_sp_demo_datastudio_report" {
+# Add Looker Studio Data Report Procedure
+resource "google_bigquery_routine" "sproc_sp_demo_lookerstudio_report" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_lookerstudio_report"
routine_type = "PROCEDURE"
language = "SQL"
- definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id })
+ definition_body = templatefile("${path.module}/src/sql/sp_lookerstudio_report.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
depends_on = [
- google_bigquery_table.tbl_edw_taxi,
+ google_bigquery_table.tbl_edw_inventory_items,
+ google_bigquery_table.tbl_edw_order_items,
+ google_bigquery_routine.sp_provision_lookup_tables,
]
}
@@ -110,24 +195,26 @@ resource "google_bigquery_routine" "sp_sample_queries" {
routine_id = "sp_sample_queries"
routine_type = "PROCEDURE"
language = "SQL"
- definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id })
+ definition_body = templatefile("${path.module}/src/sql/sp_sample_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
depends_on = [
- google_bigquery_table.tbl_edw_taxi,
+ google_bigquery_table.tbl_edw_inventory_items,
+ google_bigquery_table.tbl_edw_order_items,
]
}
-# # Add Bigquery ML Model
+
+# Add Bigquery ML Model
resource "google_bigquery_routine" "sp_bigqueryml_model" {
project = module.project-services.project_id
dataset_id = google_bigquery_dataset.ds_edw.dataset_id
routine_id = "sp_bigqueryml_model"
routine_type = "PROCEDURE"
language = "SQL"
- definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id })
+ definition_body = templatefile("${path.module}/src/sql/sp_bigqueryml_model.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
depends_on = [
- google_bigquery_table.tbl_edw_taxi,
+ google_bigquery_table.tbl_edw_order_items,
]
}
@@ -138,10 +225,10 @@ resource "google_bigquery_routine" "sp_sample_translation_queries" {
routine_id = "sp_sample_translation_queries"
routine_type = "PROCEDURE"
language = "SQL"
- definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id })
+ definition_body = templatefile("${path.module}/src/sql/sp_sample_translation_queries.sql", { project_id = module.project-services.project_id, dataset_id = google_bigquery_dataset.ds_edw.dataset_id })
depends_on = [
- google_bigquery_table.tbl_edw_taxi,
+ google_bigquery_table.tbl_edw_inventory_items,
]
}
@@ -151,6 +238,8 @@ resource "google_project_service_identity" "bigquery_data_transfer_sa" {
provider = google-beta
project = module.project-services.project_id
service = "bigquerydatatransfer.googleapis.com"
+
+ depends_on = [time_sleep.wait_after_apis]
}
# # Grant the DTS service account access
@@ -162,6 +251,8 @@ resource "google_project_iam_member" "dts_service_account_roles" {
project = module.project-services.project_id
role = each.key
member = "serviceAccount:${google_project_service_identity.bigquery_data_transfer_sa.email}"
+
+ depends_on = [time_sleep.wait_after_apis]
}
# Create specific service account for DTS Run
@@ -206,7 +297,7 @@ resource "google_bigquery_data_transfer_config" "dts_config" {
data_source_id = "scheduled_query"
schedule = "every day 00:00"
params = {
- query = "CALL `${module.project-services.project_id}.ds_edw.sp_bigqueryml_model`()"
+ query = "CALL `${module.project-services.project_id}.${google_bigquery_dataset.ds_edw.dataset_id}.sp_bigqueryml_model`()"
}
service_account_name = google_service_account.dts.email
diff --git a/modules/data_warehouse/main.tf b/modules/data_warehouse/main.tf
index 6d452f99..c60fb5de 100644
--- a/modules/data_warehouse/main.tf
+++ b/modules/data_warehouse/main.tf
@@ -1,4 +1,4 @@
-/**
+/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -60,6 +60,11 @@ module "project-services" {
]
}
+resource "time_sleep" "wait_after_apis" {
+ create_duration = "120s"
+ depends_on = [module.project-services]
+}
+
// Create random ID to be used for deployment uniqueness
resource "random_id" "id" {
byte_length = 4
@@ -77,10 +82,12 @@ resource "google_storage_bucket" "raw_bucket" {
public_access_prevention = "enforced"
+ depends_on = [time_sleep.wait_after_apis]
+
labels = var.labels
}
-# # Set up the provisioning bucketstorage bucket
+# # Set up the provisioning storage bucket
resource "google_storage_bucket" "provisioning_bucket" {
name = "ds-edw-provisioner-${random_id.id.hex}"
project = module.project-services.project_id
@@ -90,6 +97,8 @@ resource "google_storage_bucket" "provisioning_bucket" {
public_access_prevention = "enforced"
+ depends_on = [time_sleep.wait_after_apis]
+
labels = var.labels
}
@@ -99,6 +108,8 @@ resource "google_pubsub_topic" "topic" {
name = "provisioning-topic"
project = module.project-services.project_id
+ depends_on = [time_sleep.wait_after_apis]
+
labels = var.labels
}
@@ -112,6 +123,8 @@ resource "google_pubsub_topic_iam_binding" "binding" {
# # Get the GCS service account to trigger the pub/sub notification
data "google_storage_project_service_account" "gcs_account" {
project = module.project-services.project_id
+
+ depends_on = [time_sleep.wait_after_apis]
}
# # Create the Storage trigger
@@ -120,7 +133,9 @@ resource "google_storage_notification" "notification" {
bucket = google_storage_bucket.provisioning_bucket.name
payload_format = "JSON_API_V1"
topic = google_pubsub_topic.topic.id
- depends_on = [google_pubsub_topic_iam_binding.binding]
+ depends_on = [
+ google_pubsub_topic_iam_binding.binding,
+ ]
}
# # Create the Eventarc trigger
@@ -132,7 +147,6 @@ resource "google_eventarc_trigger" "trigger_pubsub_tf" {
attribute = "type"
value = "google.cloud.pubsub.topic.v1.messagePublished"
-
}
destination {
workflow = google_workflows_workflow.workflow.id
@@ -148,7 +162,6 @@ resource "google_eventarc_trigger" "trigger_pubsub_tf" {
labels = var.labels
depends_on = [
- google_workflows_workflow.workflow,
google_project_iam_member.eventarc_service_account_invoke_role,
]
}
@@ -159,6 +172,8 @@ resource "google_service_account" "eventarc_service_account" {
project = module.project-services.project_id
account_id = "eventarc-sa-${random_id.id.hex}"
display_name = "Service Account for Cloud Eventarc"
+
+ depends_on = [time_sleep.wait_after_apis]
}
# # Grant the Eventarc service account Workflow Invoker Access
@@ -166,13 +181,9 @@ resource "google_project_iam_member" "eventarc_service_account_invoke_role" {
project = module.project-services.project_id
role = "roles/workflows.invoker"
member = "serviceAccount:${google_service_account.eventarc_service_account.email}"
-
- depends_on = [
- google_service_account.eventarc_service_account
- ]
}
-// Sleep for 60 seconds to drop start file
+// Sleep for 120 seconds to drop start file
resource "time_sleep" "wait_to_startfile" {
depends_on = [
google_storage_notification.notification,
@@ -180,7 +191,7 @@ resource "time_sleep" "wait_to_startfile" {
google_workflows_workflow.workflow
]
- create_duration = "60s"
+ create_duration = "120s"
}
// Drop start file for workflow to execute
diff --git a/modules/data_warehouse/outputs.tf b/modules/data_warehouse/outputs.tf
index 207794a9..646e4ada 100644
--- a/modules/data_warehouse/outputs.tf
+++ b/modules/data_warehouse/outputs.tf
@@ -24,13 +24,14 @@ output "raw_bucket" {
description = "Raw bucket name"
}
+#TODO Create new Looker Studio Template
output "lookerstudio_report_url" {
- value = "https://lookerstudio.google.com/reporting/create?c.reportId=402d64d6-2a14-45a1-b159-0dcc88c62cd5&ds.ds0.datasourceName=vw_taxi&ds.ds0.projectId=${var.project_id}&ds.ds0.type=TABLE&ds.ds0.datasetId=ds_edw&ds.ds0.tableId=vw_lookerstudio_report"
- description = "The URL to create a new Looker Studio report displays a sample dashboard for the taxi data analysis"
+ value = "https://lookerstudio.google.com/reporting/create?c.reportId=8a6517b8-8fcd-47a2-a953-9d4fb9ae4794&ds.ds_profit.datasourceName=lookerstudio_report_profit&ds.ds_profit.projectId=${module.project-services.project_id}&ds.ds_profit.type=TABLE&ds.ds_profit.datasetId=${google_bigquery_dataset.ds_edw.dataset_id}&ds.ds_profit.tableId=lookerstudio_report_profit&ds.ds_dc.datasourceName=lookerstudio_report_distribution_centers&ds.ds_dc.projectId=${module.project-services.project_id}&ds.ds_dc.type=TABLE&ds.ds_dc.datasetId=${google_bigquery_dataset.ds_edw.dataset_id}&ds.ds_dc.tableId=lookerstudio_report_distribution_centers"
+ description = "The URL to create a new Looker Studio report displays a sample dashboard for the e-commerce data analysis"
}
output "bigquery_editor_url" {
- value = "https://console.cloud.google.com/bigquery?project=${var.project_id}&ws=!1m5!1m4!6m3!1s${var.project_id}!2sds_edw!3ssp_sample_queries"
+ value = "https://console.cloud.google.com/bigquery?project=${module.project-services.project_id}&ws=!1m5!1m4!6m3!1s${module.project-services.project_id}!2s${google_bigquery_dataset.ds_edw.dataset_id}!3ssp_sample_queries"
description = "The URL to launch the BigQuery editor with the sample query procedure opened"
}
diff --git a/modules/data_warehouse/src/schema/distribution_centers_schema.json b/modules/data_warehouse/src/schema/distribution_centers_schema.json
new file mode 100644
index 00000000..c50e6779
--- /dev/null
+++ b/modules/data_warehouse/src/schema/distribution_centers_schema.json
@@ -0,0 +1,22 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "latitude",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "longitude",
+ "type": "FLOAT"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/events_schema.json b/modules/data_warehouse/src/schema/events_schema.json
new file mode 100644
index 00000000..4a5a1f3f
--- /dev/null
+++ b/modules/data_warehouse/src/schema/events_schema.json
@@ -0,0 +1,67 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "user_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "sequence_number",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "session_id",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "created_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "ip_address",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "city",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "state",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "postal_code",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "browser",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "traffic_source",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "uri",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "event_type",
+ "type": "STRING"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/inventory_items_schema.json b/modules/data_warehouse/src/schema/inventory_items_schema.json
new file mode 100644
index 00000000..4b064798
--- /dev/null
+++ b/modules/data_warehouse/src/schema/inventory_items_schema.json
@@ -0,0 +1,62 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "created_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "sold_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "cost",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_category",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_brand",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_retail_price",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_department",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_sku",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_distribution_center_id",
+ "type": "INTEGER"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/order_items_schema.json b/modules/data_warehouse/src/schema/order_items_schema.json
new file mode 100644
index 00000000..9b0d6829
--- /dev/null
+++ b/modules/data_warehouse/src/schema/order_items_schema.json
@@ -0,0 +1,57 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "order_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "user_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "product_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "inventory_item_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "status",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "created_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "shipped_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "delivered_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "returned_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "sale_price",
+ "type": "FLOAT"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/orders_schema.json b/modules/data_warehouse/src/schema/orders_schema.json
new file mode 100644
index 00000000..bb872ca5
--- /dev/null
+++ b/modules/data_warehouse/src/schema/orders_schema.json
@@ -0,0 +1,47 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "order_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "user_id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "status",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "gender",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "created_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "returned_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "shipped_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "delivered_at",
+ "type": "TIMESTAMP"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "num_of_item",
+ "type": "INTEGER"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/products_schema.json b/modules/data_warehouse/src/schema/products_schema.json
new file mode 100644
index 00000000..da918220
--- /dev/null
+++ b/modules/data_warehouse/src/schema/products_schema.json
@@ -0,0 +1,47 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "cost",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "category",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "brand",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "retail_price",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "department",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "sku",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "distribution_center_id",
+ "type": "INTEGER"
+ }
+]
diff --git a/modules/data_warehouse/src/schema/users_schema.json b/modules/data_warehouse/src/schema/users_schema.json
new file mode 100644
index 00000000..ae067f0d
--- /dev/null
+++ b/modules/data_warehouse/src/schema/users_schema.json
@@ -0,0 +1,77 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "first_name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "last_name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "email",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "age",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "gender",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "state",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "street_address",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "postal_code",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "city",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "country",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "latitude",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "longitude",
+ "type": "FLOAT"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "traffic_source",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "created_at",
+ "type": "TIMESTAMP"
+ }
+]
diff --git a/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql b/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql
index 152ee298..83efd546 100644
--- a/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql
+++ b/modules/data_warehouse/src/sql/sp_bigqueryml_model.sql
@@ -12,29 +12,60 @@
-- See the License for the specific language governing permissions and
-- limitations under the License.
-/* Run a query to see the prediction results of the model
+/*
+Run a query to see the results of the model
--
-select * from ML.PREDICT(MODEL ds_edw.model_taxi_estimate,
- TABLE ds_edw.taxi_trips)
- limit 1000; */
+SELECT
+ CONCAT('cluster ', CAST(centroid_id as STRING)) as cluster,
+ avg_spend as average_spend,
+ count_orders as count_of_orders,
+ days_since_order
+FROM (
+ SELECT
+ centroid_id,
+ feature,
+ ROUND(numerical_value, 2) as value
+ FROM
+ ML.CENTROIDS(MODEL `${dataset_id}.customer_segment_clustering`)
+)
+PIVOT (
+ SUM(value)
+ FOR feature IN ('avg_spend', 'count_orders', 'days_since_order')
+)
+ORDER BY centroid_id
+*/
--Model Example
CREATE OR REPLACE MODEL
- `${project_id}.ds_edw.model_taxi_estimate`
-OPTIONS ( MODEL_TYPE='LINEAR_REG',
- LS_INIT_LEARN_RATE=0.15,
- L1_REG=1,
- MAX_ITERATIONS=5 ) AS
-SELECT
- pickup_datetime,
- dropoff_datetime,
- IFNULL(passenger_count,0) passenger_count,
- IFNULL(trip_distance,0) trip_distance,
- IFNULL(rate_code,'') rate_code,
- IFNULL(payment_type,'') payment_type,
- IFNULL(fare_amount,0) label,
- IFNULL(pickup_location_id,'') pickup_location_id
-FROM
- `${project_id}.ds_edw.taxi_trips`
-WHERE
- fare_amount > 0;
+ `${project_id}.${dataset_id}.customer_segment_clustering`
+ OPTIONS(
+ MODEL_TYPE = 'KMEANS', -- model name
+ NUM_CLUSTERS = 5, -- how many clusters to create
+ KMEANS_INIT_METHOD = 'KMEANS++',
+ STANDARDIZE_FEATURES = TRUE -- note: normalization taking place to scale the range of independent variables (each feature contributes proportionately to the final distance)
+ )
+ AS (
+ SELECT
+ * EXCEPT (user_id)
+ FROM (
+ SELECT
+ user_id,
+ DATE_DIFF(CURRENT_DATE(), CAST(MAX(order_created_date) as DATE), day) as days_since_order, ---RECENCY
+ COUNT(DISTINCT order_id) as count_orders, --FREQUENCY
+ AVG(sale_price) as avg_spend --MONETARY
+ FROM (
+ SELECT
+ user_id,
+ order_id,
+ sale_price,
+ created_at as order_created_date
+ FROM
+ `${project_id}.${dataset_id}.order_items`
+ WHERE
+ created_at BETWEEN TIMESTAMP('2020-07-31 00:00:00')
+ AND TIMESTAMP('2023-07-31 00:00:00')
+ )
+ GROUP BY user_id
+ )
+ )
+;
diff --git a/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql b/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql
index 423ed5a5..88d643ca 100644
--- a/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql
+++ b/modules/data_warehouse/src/sql/sp_lookerstudio_report.sql
@@ -12,99 +12,213 @@
-- See the License for the specific language governing permissions and
-- limitations under the License.
-CREATE OR REPLACE TABLE `${project_id}.ds_edw.lookerstudio_report`
+CREATE OR REPLACE VIEW `${project_id}.${dataset_id}.lookerstudio_report_distribution_centers`
OPTIONS(
labels=[("data-warehouse","true")]
)
AS
-WITH TaxiData AS
+WITH OrdersData AS
(
-SELECT VENDOR_ID as TaxiCompany,
- EXTRACT(YEAR FROM Pickup_DateTime) AS Year,
- EXTRACT(WEEK FROM Pickup_DateTime) AS WeekNumber,
- CONCAT('Week ',FORMAT("%02d",
- EXTRACT(WEEK FROM Pickup_DateTime))) AS WeekName,
- CONCAT(VENDOR_ID,':',EXTRACT(YEAR FROM Pickup_DateTime),':',FORMAT("%02d",EXTRACT(WEEK FROM Pickup_DateTime))) AS GroupPartition,
- COUNT(1) AS NumberOfRides,
- AVG(Trip_Distance) AS AvgDistance,
- SUM(Fare_Amount) AS Total_Fare_Amount,
- SUM(Extra) AS Total_Surcharge,
- SUM(MTA_Tax) AS Total_MTA_Tax,
- SUM(Tolls_Amount) AS Total_Tolls_Amount,
- SUM(imp_Surcharge) AS Total_Improvement_Surcharge,
- SUM(Tip_Amount) AS Total_Tip_Amount,
- SUM(Total_Amount) AS Total_Total_Amount
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- WHERE Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-02' --'2015-01-01' AND '2021-12-31' -- There is odd data in some of the source files from NYC
- GROUP BY 1, 2, 3, 4, 5
+ SELECT dc.name AS distribution_center_name,
+ EXTRACT(YEAR FROM order_items.created_at) AS year,
+ EXTRACT(WEEK FROM order_items.created_at) AS week_number,
+ CONCAT('Week ',FORMAT("%02d",
+ EXTRACT(WEEK FROM order_items.created_at))) AS week_name,
+ EXTRACT(DATE FROM TIMESTAMP_TRUNC(order_items.created_at, WEEK)) AS week_start_date,
+ CONCAT(product_distribution_center_id,':',EXTRACT(YEAR FROM order_items.created_at),
+ ':',FORMAT("%02d",EXTRACT(WEEK FROM order_items.created_at))) AS GroupPartition,
+ COUNT(order_items.product_id) AS products_ordered_count,
+ COUNT(DISTINCT order_items.order_id) AS orders_count,
+ SUM(cost) AS inventory_sold_cost_total,
+ AVG(cost) AS inventory_sold_cost_avg,
+ SUM(order_items.sale_price - cost) AS profit_total,
+ AVG(order_items.sale_price - cost) AS profit_avg,
+ AVG(TIMESTAMP_DIFF(delivered_at, shipped_at, HOUR)) AS shipping_hours,
+ AVG(TIMESTAMP_DIFF(shipped_at, order_items.created_at, HOUR)) AS processing_hours,
+ AVG(TIMESTAMP_DIFF(delivered_at, order_items.created_at, HOUR)) AS order_to_delivery_hours
+ FROM
+ `${project_id}.${dataset_id}.order_items` AS order_items
+ JOIN
+ `${project_id}.${dataset_id}.inventory_items` AS inventory_items ON order_items.product_id = inventory_items.product_id AND order_items.inventory_item_id = inventory_items.id
+ JOIN
+ `${project_id}.${dataset_id}.distribution_centers` AS dc ON inventory_items.product_distribution_center_id = dc.id
+ WHERE
+ order_items.created_at IS NOT NULL
+ AND order_items.created_at <= CURRENT_TIMESTAMP()
+ GROUP BY 1, 2, 3, 4, 5, 6
)
, LagPercents AS
(
-SELECT TaxiCompany,
- Year,
- WeekNumber,
- WeekName,
- NumberOfRides,
- GroupPartition,
- AvgDistance,
- Total_Fare_Amount,
- Total_Surcharge,
- Total_MTA_Tax,
- Total_Tolls_Amount,
- Total_Improvement_Surcharge,
- Total_Tip_Amount,
- Total_Total_Amount,
- LAG(NumberOfRides) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_NumberOfRides,
- LAG(AvgDistance) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_AvgDistance,
- LAG(Total_Fare_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Fare_Amount,
- LAG(Total_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Surcharge,
- LAG(Total_MTA_Tax) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_MTA_Tax,
- LAG(Total_Tolls_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tolls_Amount,
- LAG(Total_Improvement_Surcharge) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Improvement_Surcharge,
- LAG(Total_Tip_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Tip_Amount,
- LAG(Total_Total_Amount) OVER (PARTITION BY TaxiCompany ORDER BY Year, WeekNumber ASC) AS Prior_Week_Total_Total_Amount
- FROM TaxiData
+ SELECT distribution_center_name,
+ year,
+ week_number,
+ week_name,
+ week_start_date,
+ GroupPartition,
+ products_ordered_count,
+ orders_count,
+ profit_total,
+ profit_avg,
+ inventory_sold_cost_total,
+ inventory_sold_cost_avg,
+ shipping_hours,
+ processing_hours,
+ order_to_delivery_hours,
+ LAG(products_ordered_count) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_products_ordered_count,
+ LAG(orders_count) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_orders_count,
+ LAG(profit_total) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_profit_total,
+ LAG(profit_avg) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_profit_avg,
+ LAG(inventory_sold_cost_total) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_inventory_sold_cost_total,
+ LAG(inventory_sold_cost_avg) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_inventory_sold_cost_avg,
+ LAG(shipping_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_shipping_hours,
+ LAG(processing_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_processing_hours,
+ LAG(order_to_delivery_hours) OVER (PARTITION BY distribution_center_name ORDER BY year DESC, week_number DESC) AS prior_week_order_to_delivery_hours
+ FROM OrdersData
)
, PercentChange AS
(
-SELECT TaxiCompany,
- Year,
- WeekNumber,
- WeekName,
- GroupPartition,
- NumberOfRides,
- AvgDistance,
- Total_Fare_Amount,
- Total_Surcharge,
- Total_MTA_Tax,
- Total_Tolls_Amount,
- Total_Improvement_Surcharge,
- Total_Tip_Amount,
- Total_Total_Amount,
- Prior_Week_NumberOfRides,
- Prior_Week_AvgDistance,
- Prior_Week_Total_Fare_Amount,
- Prior_Week_Total_Surcharge,
- Prior_Week_Total_MTA_Tax,
- Prior_Week_Total_Tolls_Amount,
- Prior_Week_Total_Improvement_Surcharge,
- Prior_Week_Total_Tip_Amount,
- Prior_Week_Total_Total_Amount,
- SAFE_DIVIDE(CAST(NumberOfRides - Prior_Week_NumberOfRides AS NUMERIC) , CAST(Prior_Week_NumberOfRides AS NUMERIC)) AS PercentChange_NumberOfRides,
- SAFE_DIVIDE(CAST(AvgDistance - Prior_Week_AvgDistance AS NUMERIC) , CAST(Prior_Week_AvgDistance AS NUMERIC)) AS PercentChange_AvgDistance,
- SAFE_DIVIDE((Total_Fare_Amount - Prior_Week_Total_Fare_Amount) , Prior_Week_Total_Fare_Amount) AS PercentChange_Total_Fare_Amount,
- SAFE_DIVIDE((Total_Surcharge - Prior_Week_Total_Surcharge) , Prior_Week_Total_Surcharge) AS PercentChange_Total_Surcharge,
- SAFE_DIVIDE((Total_MTA_Tax - Prior_Week_Total_MTA_Tax) , Prior_Week_Total_MTA_Tax) AS PercentChange_Total_MTA_Tax,
- SAFE_DIVIDE((Total_Tolls_Amount - Prior_Week_Total_Tolls_Amount) , Prior_Week_Total_Tolls_Amount) AS PercentChange_Total_Tolls_Amount,
- SAFE_DIVIDE((Total_Improvement_Surcharge - Prior_Week_Total_Improvement_Surcharge) , Prior_Week_Total_Improvement_Surcharge) AS PercentChange_Total_Improvement_Surcharge,
- SAFE_DIVIDE((Total_Tip_Amount - Prior_Week_Total_Tip_Amount) , Prior_Week_Total_Tip_Amount) AS PercentChange_Total_Tip_Amount,
- SAFE_DIVIDE((Total_Total_Amount - Prior_Week_Total_Total_Amount) , Prior_Week_Total_Total_Amount) AS PercentChange_Total_Total_Amount
+ SELECT distribution_center_name,
+ year,
+ week_number,
+ week_name,
+ week_start_date,
+ GroupPartition,
+ products_ordered_count,
+ orders_count,
+ profit_total,
+ profit_avg,
+ inventory_sold_cost_total,
+ inventory_sold_cost_avg,
+ shipping_hours,
+ processing_hours,
+ order_to_delivery_hours,
+ prior_week_products_ordered_count,
+ prior_week_orders_count,
+ prior_week_profit_total,
+ prior_week_profit_avg,
+ prior_week_inventory_sold_cost_total,
+ prior_week_inventory_sold_cost_avg,
+ prior_week_shipping_hours,
+ prior_week_processing_hours,
+ prior_week_order_to_delivery_hours,
+ SAFE_DIVIDE(CAST(products_ordered_count - prior_week_products_ordered_count AS NUMERIC) , CAST(prior_week_products_ordered_count AS NUMERIC)) AS percent_change_products_ordered_count,
+ SAFE_DIVIDE(CAST(orders_count - prior_week_orders_count AS NUMERIC) , CAST(prior_week_orders_count AS NUMERIC)) AS percent_change_orders_count,
+ SAFE_DIVIDE((profit_total - prior_week_profit_total) , prior_week_profit_total) AS percent_change_profit_total,
+ SAFE_DIVIDE((profit_avg - prior_week_profit_avg) , prior_week_profit_avg) AS percent_change_profit_avg,
+ SAFE_DIVIDE((inventory_sold_cost_total - prior_week_inventory_sold_cost_total) , prior_week_inventory_sold_cost_total) AS percent_change_inventory_sold_cost_total,
+ SAFE_DIVIDE((inventory_sold_cost_avg - prior_week_inventory_sold_cost_avg) , prior_week_inventory_sold_cost_avg) AS percent_change_inventory_sold_cost_avg,
+ SAFE_DIVIDE((shipping_hours - prior_week_shipping_hours) , prior_week_shipping_hours) AS percent_change_shipping_hours,
+ SAFE_DIVIDE((processing_hours - prior_week_processing_hours) , prior_week_processing_hours) AS percent_change_processing_hours,
+ SAFE_DIVIDE((order_to_delivery_hours - prior_week_order_to_delivery_hours) , prior_week_order_to_delivery_hours) AS percent_change_order_to_delivery_hours
FROM LagPercents
)
SELECT *
- FROM PercentChange
+FROM PercentChange
ORDER BY GroupPartition;
-CREATE OR REPLACE VIEW `${project_id}.ds_edw.vw_lookerstudio_report` as
-SELECT * FROM `${project_id}.ds_edw.lookerstudio_report`
-WHERE Year in (2022);
+CREATE OR REPLACE VIEW `${project_id}.${dataset_id}.lookerstudio_report_profit`
+OPTIONS(
+ labels=[("data-warehouse","true")]
+)
+AS
+with SubsetInventory AS(
+ SELECT
+ SUM(ROUND(product_retail_price,2)) AS revenue_total,
+ SUM(ROUND(cost,2)) AS cost_total,
+ SUM(ROUND(product_retail_price-cost, 2)) AS profit_total,
+ CONCAT(product_department, " - ", product_category) AS product_dept_cat,
+ EXTRACT(DATE from sold_at) AS sold_at_day
+ FROM
+ `${project_id}.${dataset_id}.inventory_items`
+ WHERE
+ sold_at <= CURRENT_TIMESTAMP()
+ GROUP BY
+ product_dept_cat, sold_at_day
+),
+
+Inventory7d AS (
+ SELECT
+ product_dept_cat,
+ sold_at_day AS day,
+ revenue_total,
+ cost_total,
+ profit_total,
+ SUM(ROUND(revenue_total,2)) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(sold_at_day) ASC RANGE BETWEEN 6 PRECEDING and CURRENT ROW) AS revenue_last_7d,
+ SUM(ROUND(cost_total,2)) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(sold_at_day) ASC RANGE BETWEEN 6 PRECEDING and CURRENT ROW) AS cost_last_7d
+ FROM
+ SubsetInventory
+),
+
+Lags AS (
+ SELECT
+ product_dept_cat,
+ day,
+ revenue_total,
+ cost_total,
+ profit_total,
+ revenue_last_7d,
+ cost_last_7d,
+ ROUND(SAFE_SUBTRACT(revenue_last_7d, cost_last_7d),2) AS profit_last_7d,
+ LAG(revenue_last_7d,30) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_month_revenue_last_7d,
+ LAG(cost_last_7d,30) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_month_cost_last_7d,
+ LAG(revenue_last_7d,365) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_year_revenue_last_7d,
+ LAG(cost_last_7d,365) OVER (PARTITION BY product_dept_cat ORDER BY UNIX_DATE(day) ASC) AS prior_year_cost_last_7d,
+ FROM
+ Inventory7d
+),
+
+LagPercentages AS (
+ SELECT
+ day,
+ product_dept_cat,
+ revenue_total,
+ cost_total,
+ profit_total,
+ revenue_last_7d,
+ prior_month_revenue_last_7d,
+ prior_year_revenue_last_7d,
+ SAFE_DIVIDE((revenue_last_7d - prior_month_revenue_last_7d), prior_month_revenue_last_7d) AS percent_change_revenue_month,
+ SAFE_DIVIDE((revenue_last_7d - prior_year_revenue_last_7d), prior_year_revenue_last_7d) AS percent_change_revenue_year,
+ cost_last_7d,
+ prior_month_cost_last_7d,
+ prior_year_cost_last_7d,
+ SAFE_DIVIDE((cost_last_7d - prior_month_cost_last_7d), prior_month_cost_last_7d) AS percent_change_cost_month,
+ SAFE_DIVIDE((cost_last_7d - prior_year_cost_last_7d), prior_year_cost_last_7d) AS percent_change_cost_year,
+ profit_last_7d,
+ ROUND(SAFE_SUBTRACT(prior_month_revenue_last_7d, prior_month_cost_last_7d),2) AS prior_month_profit_last_7d,
+ ROUND(SAFE_SUBTRACT(prior_year_revenue_last_7d, prior_year_cost_last_7d),2) AS prior_year_profit_last_7d,
+ FROM
+ Lags
+),
+
+ProfitPercentages AS (
+ SELECT
+ day,
+ product_dept_cat,
+ revenue_total,
+ revenue_last_7d,
+ prior_month_revenue_last_7d,
+ percent_change_revenue_month,
+ prior_year_revenue_last_7d,
+ percent_change_revenue_year,
+ cost_total,
+ cost_last_7d,
+ prior_month_cost_last_7d,
+ percent_change_cost_month,
+ prior_year_cost_last_7d,
+ percent_change_cost_year,
+ profit_total,
+ profit_last_7d,
+ prior_month_profit_last_7d,
+ SAFE_DIVIDE((profit_last_7d - prior_month_profit_last_7d), prior_month_profit_last_7d) AS percent_change_profit_month,
+ prior_year_profit_last_7d,
+ SAFE_DIVIDE((profit_last_7d - prior_year_profit_last_7d), prior_year_profit_last_7d) AS percent_change_profit_year
+ FROM
+ LagPercentages
+ ORDER BY
+ day DESC
+)
+
+SELECT *
+FROM ProfitPercentages
+ORDER BY day DESC;
diff --git a/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql b/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql
index 1cacb53b..3a6457f5 100644
--- a/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql
+++ b/modules/data_warehouse/src/sql/sp_provision_lookup_tables.sql
@@ -12,36 +12,36 @@
-- See the License for the specific language governing permissions and
-- limitations under the License.
-CREATE OR REPLACE TABLE `${project_id}.ds_edw.vendor`
- (
- Vendor_Id INTEGER,
- Vendor_Description STRING
- )
-OPTIONS(
- labels=[("data-warehouse","true")]
-)
-AS
-SELECT 1, 'Creative Mobile Technologies, LLC'
-UNION ALL
-SELECT 2, 'VeriFone Inc.';
+CREATE OR REPLACE TABLE `${project_id}.${dataset_id}.distribution_centers`
-CREATE OR REPLACE TABLE `${project_id}.ds_edw.payment_type`
(
- Payment_Type_Id INTEGER,
- Payment_Type_Description STRING
+ id INTEGER,
+ name STRING,
+ longitude FLOAT64,
+ latitude FLOAT64,
+ distribution_center_geom GEOGRAPHY
)
-OPTIONS(
- labels=[("data-warehouse","true")]
-)
+ OPTIONS(
+ labels=[("data-warehouse","true")]
+ )
AS
-SELECT 1, 'Credit card'
+SELECT 1, 'Memphis TN', -89.9711, 35.1174, ST_GEOGPOINT(-89.9711, 35.1174)
+UNION ALL
+SELECT 2, 'Chicago IL', -87.6847, 41.8369, ST_GEOGPOINT(-87.6847, 41.8369)
+UNION ALL
+SELECT 3, 'Houston TX', -95.3698, 29.7604, ST_GEOGPOINT(-95.3698, 29.7604)
+UNION ALL
+SELECT 4, 'Los Angeles CA', -118.25, 34.05, ST_GEOGPOINT(-118.25, 34.05)
+UNION ALL
+SELECT 5, 'New Orleans LA', -90.0667, 29.95, ST_GEOGPOINT(-90.0667, 29.95)
UNION ALL
-SELECT 2, 'Cash'
+SELECT 6, 'Port Authority of New York/New Jersey NY/NJ', -73.7834, 40.634, ST_GEOGPOINT(-73.7834, 40.634)
UNION ALL
-SELECT 3, 'No charge'
+SELECT 7, 'Philadelphia PA', -75.1667, 39.95, ST_GEOGPOINT(-75.1667, 39.95)
UNION ALL
-SELECT 4, 'Dispute'
+SELECT 8, 'Mobile AL', -88.0431, 30.6944, ST_GEOGPOINT(-88.0431, 30.6944)
UNION ALL
-SELECT 5, 'Unknown'
+SELECT 9, 'Charleston SC', -79.9333, 32.7833, ST_GEOGPOINT(-79.9333, 32.7833)
UNION ALL
-SELECT 6, 'Voided trip';
+SELECT 10, 'Savannah GA', -81.1167, 32.0167, ST_GEOGPOINT(-81.1167, 32.0167)
+;
diff --git a/modules/data_warehouse/src/sql/sp_sample_queries.sql b/modules/data_warehouse/src/sql/sp_sample_queries.sql
index c88183ba..fdb4ace3 100644
--- a/modules/data_warehouse/src/sql/sp_sample_queries.sql
+++ b/modules/data_warehouse/src/sql/sp_sample_queries.sql
@@ -29,118 +29,118 @@ Clean up / Reset script:
--Rank, Pivot, Json
--- Query: Get trips over $50 for each day of the week.
--- Shows: Date Functions, Joins, Group By, Having, Ordinal Group/Having
-SELECT FORMAT_DATE("%w", Pickup_DateTime) AS WeekdayNumber,
- FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName,
- vendor.Vendor_Description,
- payment_type.Payment_Type_Description,
- SUM(taxi_trips.Total_Amount) AS high_value_trips
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- INNER JOIN `${project_id}.ds_edw.vendor` AS vendor
- ON cast(taxi_trips.Vendor_Id as INT64) = vendor.Vendor_Id
- AND taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- LEFT JOIN `${project_id}.ds_edw.payment_type` AS payment_type
- ON cast(taxi_trips.payment_type as INT64) = payment_type.Payment_Type_Id
-GROUP BY 1, 2, 3, 4
-HAVING SUM(taxi_trips.Total_Amount) > 50
-ORDER BY WeekdayNumber, 3, 4;
+-- Query: See the order price quartiles for each day of the week.
+-- Shows: Date Functions, Joins, Group By, Having, Ordinal Group/Having, Quantiles
+SELECT
+ FORMAT_DATE("%w", created_at) AS WeekdayNumber,
+ FORMAT_DATE("%A", created_at) AS WeekdayName,
+ APPROX_QUANTILES(order_price, 4) AS quartiles
+ FROM (
+ SELECT
+ created_at,
+ SUM(sale_price) AS order_price
+ FROM
+ `${project_id}.${dataset_id}.order_items`
+ GROUP BY
+ order_id, 1
+ HAVING SUM(sale_price) > 10)
+ GROUP BY
+ 1, 2
+ ORDER BY
+ WeekdayNumber, 3
+;
+-- Query: Items with less than 30 days of inventory remaining
+WITH Orders AS (
+ SELECT
+ order_items.product_id AS product_id,
+ COUNT(order_items.id) AS count_sold_30d
+ FROM
+ `${project_id}.${dataset_id}.order_items` AS order_items
+ WHERE
+ order_items.created_at > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 DAY)
+ GROUP BY
+ product_id
+),
--- Query: amounts (Cash/Credit) by passenger type
-WITH TaxiDataRanking AS
-(
-SELECT CAST(Pickup_DateTime AS DATE) AS Pickup_Date,
- cast(taxi_trips.payment_type as INT64) as Payment_Type_Id,
- taxi_trips.Passenger_Count,
- taxi_trips.Total_Amount,
- RANK() OVER (PARTITION BY CAST(Pickup_DateTime AS DATE),
- taxi_trips.payment_type
- ORDER BY taxi_trips.Passenger_Count DESC,
- taxi_trips.Total_Amount DESC) AS Ranking
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- AND cast(taxi_trips.payment_type as INT64) IN (1,2)
-)
-SELECT Pickup_Date,
- Payment_Type_Description,
- Passenger_Count,
- Total_Amount
- FROM TaxiDataRanking
- INNER JOIN `${project_id}.ds_edw.payment_type` AS payment_type
- ON TaxiDataRanking.Payment_Type_Id = payment_type.Payment_Type_Id
-WHERE Ranking = 1
-ORDER BY Pickup_Date, Payment_Type_Description;
-
+OnHand AS (
+ SELECT
+ inventory.product_id AS product_id,
+ inventory.product_name AS product_name,
+ COUNT(inventory.id) AS count_in_stock
+ FROM
+ `${project_id}.${dataset_id}.inventory_items` AS inventory
+ WHERE
+ inventory.sold_at IS NULL
+ GROUP BY
+ product_id,
+ product_name
+ ORDER BY
+ count_in_stock DESC
+),
--- Query: data summed by payment type and passenger count, then pivoted based upon payment type
-WITH MonthlyData AS
-(
-SELECT FORMAT_DATE("%B", taxi_trips.Pickup_DateTime) AS MonthName,
- FORMAT_DATE("%m", taxi_trips.Pickup_DateTime) AS MonthNumber,
- CASE WHEN cast(taxi_trips.payment_type as INT64) = 1 THEN 'Credit'
- WHEN cast(taxi_trips.payment_type as INT64) = 2 THEN 'Cash'
- WHEN cast(taxi_trips.payment_type as INT64) = 3 THEN 'NoCharge'
- WHEN cast(taxi_trips.payment_type as INT64) = 4 THEN 'Dispute'
- END AS PaymentDescription,
- taxi_trips.Passenger_Count,
- taxi_trips.Total_Amount
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- AND Passenger_Count IS NOT NULL
- AND cast(payment_type as INT64) IN (1,2,3,4)
+End30dInventory AS (
+ SELECT
+ OnHand.*,
+ Orders.count_sold_30d,
+ count_in_stock - count_sold_30d AS expected_inventory_30d
+ FROM
+ OnHand
+ INNER JOIN
+ Orders USING (product_id)
)
-SELECT MonthName,
- Passenger_Count,
- FORMAT("%'d", CAST(Credit AS INTEGER)) AS Credit,
- FORMAT("%'d", CAST(Cash AS INTEGER)) AS Cash,
- FORMAT("%'d", CAST(NoCharge AS INTEGER)) AS NoCharge,
- FORMAT("%'d", CAST(Dispute AS INTEGER)) AS Dispute
- FROM MonthlyData
- PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))
-ORDER BY MonthNumber, Passenger_Count;
+SELECT
+ RANK() OVER (ORDER BY expected_inventory_30d ASC) AS rank,
+ End30dInventory.product_name,
+ End30dInventory.expected_inventory_30d,
+ End30dInventory.count_in_stock AS current_stock,
+ End30dInventory.count_sold_30d
+FROM
+ End30dInventory
+ORDER BY
+ rank ASC, current_stock DESC
+;
--- Query: data pivoted by payment type
-WITH MonthlyData AS
-(
-SELECT FORMAT_DATE("%B", taxi_trips.Pickup_DateTime) AS MonthName,
- FORMAT_DATE("%m", taxi_trips.Pickup_DateTime) AS MonthNumber,
- CASE WHEN cast(taxi_trips.payment_type as INT64) = 1 THEN 'Credit'
- WHEN cast(taxi_trips.payment_type as INT64) = 2 THEN 'Cash'
- WHEN cast(taxi_trips.payment_type as INT64) = 3 THEN 'NoCharge'
- WHEN cast(taxi_trips.payment_type as INT64) = 4 THEN 'Dispute'
- END AS PaymentDescription,
- SUM(taxi_trips.Total_Amount) AS Total_Amount
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- AND Passenger_Count IS NOT NULL
- AND cast(taxi_trips.payment_type as INT64) IN (1,2,3,4)
- GROUP BY 1, 2, 3
+-- Query: data summed by month, then pivoted by department
+with MonthlyData AS(
+ SELECT
+ sold_at,
+ FORMAT_DATE("%B", inventory.sold_at) AS month_name,
+ FORMAT_DATE("%m", inventory.sold_at) AS month_number,
+ SAFE_SUBTRACT(inventory.product_retail_price, inventory.cost) AS profit,
+ inventory.product_department AS product_department
+ FROM
+ `${project_id}.${dataset_id}.inventory_items` AS inventory
+ WHERE
+ sold_at IS NOT NULL
)
-SELECT MonthName,
- FORMAT("%'d", CAST(Credit AS INTEGER)) AS Credit,
- FORMAT("%'d", CAST(Cash AS INTEGER)) AS Cash,
- FORMAT("%'d", CAST(NoCharge AS INTEGER)) AS NoCharge,
- FORMAT("%'d", CAST(Dispute AS INTEGER)) AS Dispute
- FROM MonthlyData
- PIVOT(SUM(Total_Amount) FOR PaymentDescription IN ('Credit', 'Cash', 'NoCharge', 'Dispute'))
-ORDER BY MonthNumber;
+SELECT
+ month_name,
+ FORMAT("%'d", CAST(Profit_Men AS INTEGER)) AS Profit_Men,
+ FORMAT("%'d", CAST(Profit_Women AS INTEGER)) AS Profit_Women
+FROM
+ MonthlyData
+PIVOT
+ (SUM(profit) AS Profit FOR product_department IN ("Men", "Women"))
+ORDER BY month_number ASC
+;
--- Query: See what day of the week in each month has the greatest amount (that's the month/day to work)
-WITH WeekdayData AS
-(
-SELECT FORMAT_DATE("%B", Pickup_DateTime) AS MonthName,
- FORMAT_DATE("%m", Pickup_DateTime) AS MonthNumber,
- FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName,
- SUM(taxi_trips.Total_Amount) AS Total_Amount
- FROM `${project_id}.ds_edw.taxi_trips` AS taxi_trips
- WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- AND cast(taxi_trips.payment_type as INT64) IN (1,2,3,4)
+-- Query: See what day of the week in each month has the greatest amount of sales(that's the month/day to work)
+WITH WeekdayData AS (
+ SELECT
+ FORMAT_DATE("%B", inventory.sold_at) AS month_name,
+ FORMAT_DATE("%m", inventory.sold_at) AS month_number,
+ FORMAT_DATE("%A", inventory.sold_at) AS weekday_name,
+ SUM(inventory.product_retail_price) AS revenue
+ FROM
+ `${project_id}.${dataset_id}.inventory_items` AS inventory
+ WHERE
+ inventory.sold_at IS NOT NULL
GROUP BY 1, 2, 3
)
-SELECT MonthName,
+SELECT month_name,
FORMAT("%'d", CAST(Sunday AS INTEGER)) AS Sunday,
FORMAT("%'d", CAST(Monday AS INTEGER)) AS Monday,
FORMAT("%'d", CAST(Tuesday AS INTEGER)) AS Tuesday,
@@ -149,5 +149,49 @@ SELECT MonthName,
FORMAT("%'d", CAST(Friday AS INTEGER)) AS Friday,
FORMAT("%'d", CAST(Saturday AS INTEGER)) AS Saturday,
FROM WeekdayData
- PIVOT(SUM(Total_Amount) FOR WeekdayName IN ('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday'))
-ORDER BY MonthNumber;
+ PIVOT(SUM(revenue) FOR weekday_name IN ('Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday'))
+ORDER BY month_number
+;
+
+-- Query: Revenue pivoted by category name for each month.
+-- This query dynamically generates the pivot column names based on the distinct values in the product_category column
+EXECUTE IMMEDIATE FORMAT("""
+ with Subset AS(
+ SELECT
+ EXTRACT(MONTH FROM inventory.sold_at) AS month_number,
+ inventory.product_category,
+ inventory.product_retail_price
+ FROM
+ `${project_id}.${dataset_id}.inventory_items` AS inventory
+ WHERE
+ inventory.sold_at IS NOT NULL)
+
+ SELECT
+ CASE
+ WHEN month_number = 1 THEN 'January'
+ WHEN month_number = 2 THEN 'February'
+ WHEN month_number = 3 THEN 'March'
+ WHEN month_number = 4 THEN 'April'
+ WHEN month_number = 5 THEN 'May'
+ WHEN month_number = 6 THEN 'June'
+ WHEN month_number = 7 THEN 'July'
+ WHEN month_number = 8 THEN 'August'
+ WHEN month_number = 9 THEN 'September'
+ WHEN month_number = 10 THEN 'October'
+ WHEN month_number = 11 THEN 'November'
+ WHEN month_number = 12 THEN 'December'
+ END AS month_name,
+ * EXCEPT (month_number)
+ FROM
+ Subset
+ PIVOT (SUM(Subset.product_retail_price) as Revenue FOR product_category IN %s)
+ ORDER BY month_number;
+ """,
+ (
+ SELECT
+ CONCAT("(", STRING_AGG(DISTINCT CONCAT("'", product_category, "'"), ','), ")")
+ FROM
+ `${project_id}.${dataset_id}.inventory_items`
+ )
+)
+;
diff --git a/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql b/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql
index c59ede6a..fa7c410f 100644
--- a/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql
+++ b/modules/data_warehouse/src/sql/sp_sample_translation_queries.sql
@@ -16,56 +16,27 @@
The queries below are examples of non-BigQuery SQL syntax that can be used with the
interactive translator to see before and after changes performed.
-The sample queries below use PostgreSQL syntax.*/
+The sample query below uses PostgreSQL syntax.*/
/* Query 1
-------------
-CREATE TABLE taxi_trips (payment_type VARCHAR, Vendor_Id VARCHAR);
-SELECT FORMAT_DATE("%w", Pickup_DateTime) AS WeekdayNumber,
- FORMAT_DATE("%A", Pickup_DateTime) AS WeekdayName,
- vendor.Vendor_Description,
- payment_type.Payment_Type_Description,
- SUM(taxi_trips.Total_Amount) AS high_value_trips
- FROM ds_edw.taxi_trips AS taxi_trips
- INNER JOIN ds_edw.vendor AS vendor
- ON cast(taxi_trips.Vendor_Id as int) = vendor.Vendor_Id
- AND taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- LEFT JOIN ds_edw.payment_type AS payment_type
- ON taxi_trips.payment_type::int = payment_type.Payment_Type_Id
-GROUP BY 1, 2, 3, 4
-HAVING SUM(taxi_trips.Total_Amount) > 50
-ORDER BY WeekdayNumber, 3, 4;
+CREATE TABLE ${project_id}.${dataset_id}.inventory_items (id VARCHAR, product_id VARCHAR, created_at TIMESTAMP, sold_at TIMESTAMP, cost NUMERIC, product_category VARCHAR, product_name VARCHAR, product_brand VARCHAR, product_retail_price NUMERIC, product_department VARCHAR, product_sku VARCHAR, product_distribution_center_id VARCHAR);
+CREATE TABLE ${project_id}.${dataset_id}.order_items (id INTEGER, order_id INTEGER, user_id INTEGER, product_id INTEGER, inventory_item_id INTEGER, status VARCHAR, created_at TIMESTAMP, shipped_at TIMESTAMP, delivered_at TIMESTAMP, returned_at TIMESTAMP, sale_price NUMERIC);
+
+SELECT
+ EXTRACT(dow from order_items.created_at) AS WeekdayNumber,
+ TO_CHAR(order_items.created_at, 'DAY') AS WeekdayName,
+ inventory.product_category AS product_category,
+ COUNT(DISTINCT order_items.order_id) AS num_high_value_orders
+FROM ${project_id}.${dataset_id}.inventory_items AS inventory
+ INNER JOIN ${project_id}.${dataset_id}.order_items AS order_items
+ ON inventory.id::int = order_items.inventory_item_id
+ AND cast(inventory.product_id as int) = order_items.product_id
+ AND order_items.created_at BETWEEN TO_TIMESTAMP('2022-01-01','YYYY-MM-DD') AND TO_TIMESTAMP('2022-12-31','YYYY-MM-DD')
+GROUP BY 1, 2, 3
+HAVING AVG(order_items.sale_price) > 85;
*/
-/* Query 2
--------------
-CREATE TABLE taxi_trips (payment_type VARCHAR, Vendor_Id VARCHAR);
-
-WITH TaxiDataRanking AS
-(
-SELECT CAST(Pickup_DateTime AS DATE) AS Pickup_Date,
- taxi_trips.payment_type as Payment_Type_Id,
- taxi_trips.Passenger_Count,
- taxi_trips.Total_Amount,
- RANK() OVER (PARTITION BY CAST(Pickup_DateTime AS DATE),
- taxi_trips.payment_type
- ORDER BY taxi_trips.Passenger_Count DESC,
- taxi_trips.Total_Amount DESC) AS Ranking
- FROM ds_edw.taxi_trips AS taxi_trips
-WHERE taxi_trips.Pickup_DateTime BETWEEN '2022-01-01' AND '2022-02-01'
- AND taxi_trips.payment_type::int IN (1,2)
-)
-SELECT Pickup_Date,
- Payment_Type_Description,
- Passenger_Count,
- Total_Amount
- FROM TaxiDataRanking
- INNER JOIN ds_edw.payment_type AS payment_type
- ON TaxiDataRanking.Payment_Type_Id = payment_type.Payment_Type_Id
-WHERE Ranking = 1
-ORDER BY Pickup_Date, Payment_Type_Description;
-*/
-
SELECT 'OPEN THE STORED PROCEDURE FOR MORE DETAILS TO USE THE TRANSLATION SERVICE' as sql_text;
diff --git a/modules/data_warehouse/src/taxi_trips_schema.json b/modules/data_warehouse/src/taxi_trips_schema.json
deleted file mode 100644
index 5bf80035..00000000
--- a/modules/data_warehouse/src/taxi_trips_schema.json
+++ /dev/null
@@ -1,116 +0,0 @@
-[
- {
- "name": "vendor_id",
- "type": "STRING",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "pickup_datetime",
- "type": "TIMESTAMP",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "dropoff_datetime",
- "type": "TIMESTAMP",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "passenger_count",
- "type": "INTEGER",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "trip_distance",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "rate_code",
- "type": "STRING",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "store_and_fwd_flag",
- "type": "STRING",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "payment_type",
- "type": "STRING",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "fare_amount",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "extra",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "mta_tax",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "tip_amount",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "tolls_amount",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "imp_surcharge",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "airport_fee",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "total_amount",
- "type": "NUMERIC",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "pickup_location_id",
- "type": "STRING",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "data_file_year",
- "type": "INTEGER",
- "mode": "NULLABLE",
- "description": ""
- },
- {
- "name": "data_file_month",
- "type": "INTEGER",
- "mode": "NULLABLE",
- "description": ""
- }
- ]
diff --git a/modules/data_warehouse/templates/workflow.tftpl b/modules/data_warehouse/templates/workflow.tftpl
index b39338a6..51ff0ee3 100644
--- a/modules/data_warehouse/templates/workflow.tftpl
+++ b/modules/data_warehouse/templates/workflow.tftpl
@@ -33,12 +33,13 @@ copy_objects:
assign:
- source_bucket: "data-analytics-demos"
- dest_bucket: ${raw_bucket}
+ - dataset_id: ${dataset_id}
- copied_objects: []
- list_objects:
call: googleapis.storage.v1.objects.list
args:
bucket: $${source_bucket}
- prefix: "new-york-taxi-trips/tlc-yellow-trips-2022"
+ prefix: "thelook-ecommerce"
result: list_result
- start_counter:
assign:
@@ -81,8 +82,8 @@ create_tables:
- results: {}
- project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
- map:
- 1: $${"CALL `"+project_id+".ds_edw.sp_provision_lookup_tables`();"}
- 2: $${"CALL `"+project_id+".ds_edw.sp_lookerstudio_report`();"}
+ 1: $${"CALL `"+project_id+".dataset_id.sp_provision_lookup_tables`();"}
+ 2: $${"CALL `"+project_id+".dataset_id.sp_lookerstudio_report`();"}
- loopStepTables:
for:
value: key
diff --git a/modules/data_warehouse/workflows.tf b/modules/data_warehouse/workflows.tf
index c8835944..81973415 100644
--- a/modules/data_warehouse/workflows.tf
+++ b/modules/data_warehouse/workflows.tf
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-# Set up Workflows service account
-# # Set up the Workflows service account
+# Set up the Workflows service account
resource "google_service_account" "workflow_service_account" {
project = module.project-services.project_id
account_id = "cloud-workflow-sa-${random_id.id.hex}"
@@ -32,12 +31,11 @@ resource "google_project_iam_member" "workflow_service_account_roles" {
"roles/bigquery.connectionUser",
"roles/bigquery.jobUser",
"roles/bigquery.dataEditor",
- ])
-
+ ]
+ )
project = module.project-services.project_id
role = each.key
member = "serviceAccount:${google_service_account.workflow_service_account.email}"
-
}
# # Create the workflow
@@ -49,7 +47,8 @@ resource "google_workflows_workflow" "workflow" {
service_account = google_service_account.workflow_service_account.id
source_contents = templatefile("${path.module}/templates/workflow.tftpl", {
- raw_bucket = google_storage_bucket.raw_bucket.name
+ raw_bucket = google_storage_bucket.raw_bucket.name,
+ dataset_id = google_bigquery_dataset.ds_edw.dataset_id
})
labels = var.labels