Skip to content

Commit

Permalink
Ai examples (#79)
Browse files Browse the repository at this point in the history
* Added ai example files

* Updated Readme

* Added formatting changes

* Updated examples

* Added formatting changes

* Removed unneeded lines

* Made fixes

* Added changes to call open ai by mlflow gateway and creating gateway routes

* Removed extra code and added example notebook for creating mlflow gateway routes

* formatting changes

* fixed name

* Updates notes

* Made changes as per Review comments

---------

Co-authored-by: souravg-db <souravg-db>
  • Loading branch information
souravg-db authored Oct 12, 2023
1 parent 7c462df commit c73465d
Show file tree
Hide file tree
Showing 4 changed files with 393 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ Operations are applied concurrently across multiple tables
* Visualise quantity of data written per table per period
* **Governance**
* PII detection with Presidio ([example notebook](examples/pii_detection_presidio.py))
* Text Analysis with MosaicML and Databricks MLflow ([example notebook](examples/text_analysis_mosaicml_mlflow.py))
* Text Analysis with OpenAI GPT ([example notebook](examples/text_analysis_openai_gpt.py))
* [GDPR right of access: extract user data from all tables at once](docs/GDPR_RoA.md)
* [GDPR right of erasure: delete user data from all tables at once](docs/GDPR_RoE.md)
* [Search in any column](docs/Search.md)
Expand All @@ -26,6 +28,7 @@ Operations are applied concurrently across multiple tables
* [Delete data based on semantic classes](docs/Delete_by_class.md)
* **Custom**
* [Arbitrary SQL template execution across multiple tables](docs/Arbitrary_multi-table_SQL.md)
* Create MLflow gateway routes for MosaicML and OpenAI ([example notebook](examples/mlflow_gateway_routes_examples.py))

## Getting started

Expand Down
110 changes: 110 additions & 0 deletions examples/mlflow_gateway_routes_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Create MLflow Gateway Routes for MosaicML & OpenAI
# MAGIC This notebook provides examples of creating MLflow gateway routes for MosaicML & OpenAI
# MAGIC
# MAGIC **NOTE**:
# MAGIC - This notebook requires >= DBR 13.3 LTS ML Runtime
# MAGIC - Please refer to [configuring-the-ai-gateway](https://mlflow.org/docs/latest/gateway/index.html#configuring-the-ai-gateway) for more info

# COMMAND ----------

# MAGIC %md
# MAGIC ### Install dependencies

# COMMAND ----------

# MAGIC %pip install mlflow[gateway]
# MAGIC dbutils.library.restartPython()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Setup widgets

# COMMAND ----------

# Text widgets holding the names of the gateway routes this notebook looks up or creates.
# NOTE(review): the widget key "moasicml_route_name" is misspelled ("mosaicml"); it is kept
# as-is because the cell that reads the widget uses the same key.
dbutils.widgets.text("moasicml_route_name", "discoverx-mosaicml-llama2-70b-completions", "mosaicml route name")
dbutils.widgets.text("openai_route_name", "discoverx-openai-gpt-3.5-completions", "openai route name")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Import required libs and initialize variables

# COMMAND ----------

import mlflow
from mlflow import gateway

# COMMAND ----------

# Read both route names from the notebook widgets in one pass.
moasicml_route_name, openai_route_name = (
    dbutils.widgets.get(widget_key) for widget_key in ("moasicml_route_name", "openai_route_name")
)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Create MLflow gateway route for MosaicML (llama2 model)

# COMMAND ----------

# get or create mosaicml route
import mlflow
from mlflow import gateway

# Point the MLflow gateway client at the Databricks-hosted gateway.
gateway.set_gateway_uri(gateway_uri="databricks")

try:
    # Reuse the route when it already exists.
    route = gateway.get_route(moasicml_route_name)
except Exception:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # still stop the cell.
    # NOTE(review): every lookup failure is treated as "route does not exist"; narrow this
    # to the client's not-found error once the raised exception type is confirmed.
    # Create a completions route with MosaicML
    print(f"Creating the route {moasicml_route_name}")
    route = gateway.create_route(
        name=moasicml_route_name,
        route_type="llm/v1/completions",
        model={
            "name": "llama2-70b-chat",
            "provider": "mosaicml",
            "mosaicml_config": {
                # The API key is read from the "discoverx" secret scope.
                "mosaicml_api_key": dbutils.secrets.get(scope="discoverx", key="mosaic_ml_api_key")
            },
        },
    )
    # `route` is bound on both paths; print the newly created route as before.
    print(route)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Create MLflow gateway route for OpenAI (GPT 3.5 model)

# COMMAND ----------

# get or create openai route
import mlflow
from mlflow import gateway

# Point the MLflow gateway client at the Databricks-hosted gateway.
gateway.set_gateway_uri(gateway_uri="databricks")

try:
    # Reuse the route when it already exists.
    route = gateway.get_route(openai_route_name)
except Exception:
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # still stop the cell.
    # NOTE(review): every lookup failure is treated as "route does not exist"; narrow this
    # to the client's not-found error once the raised exception type is confirmed.
    # Create a completions route with OpenAI
    print(f"Creating the route {openai_route_name}")
    route = gateway.create_route(
        name=openai_route_name,
        route_type="llm/v1/completions",
        model={
            "name": "gpt-35-turbo",
            "provider": "openai",
            "openai_config": {
                # Key, endpoint and deployment name are read from the "discoverx" secret scope.
                "openai_api_key": dbutils.secrets.get(scope="discoverx", key="openaikey"),
                "openai_api_base": dbutils.secrets.get(scope="discoverx", key="openaibase"),
                "openai_deployment_name": dbutils.secrets.get(scope="discoverx", key="openai_deployment_name"),
                # The route is configured for the Azure flavour of the OpenAI API.
                "openai_api_type": "azure",
                "openai_api_version": "2023-05-15",
            },
        },
    )
    # `route` is bound on both paths; print the newly created route as before.
    print(route)
139 changes: 139 additions & 0 deletions examples/text_analysis_mosaicml_mlflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Text analysis with DiscoverX, MosaicML & Databricks MLflow
# MAGIC
# MAGIC This notebook uses [DiscoverX](https://github.com/databrickslabs/discoverx) to analyze text with [MosaicML](https://www.mosaicml.com/blog/llama2-inference) over a set of tables in Unity Catalog.
# MAGIC
# MAGIC The notebook will:
# MAGIC 1. Use DiscoverX to sample a set of tables from Unity Catalog and unpivot all string columns into a long format dataset
# MAGIC 2. Run text analysis with MosaicML llama2-70b model & Databricks MLflow
# MAGIC
# MAGIC **NOTE**:
# MAGIC - This notebook requires >= DBR 13.3 LTS ML Runtime
# MAGIC - This notebook requires an MLflow gateway route for MosaicML. For examples of creating routes, please refer to the [README.md](https://github.com/databrickslabs/discoverx/blob/master/README.md) file.
# MAGIC - For detailed information about the cost of API hits, please refer to [MosaicML Inference](https://www.mosaicml.com/inference)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Install dependencies

# COMMAND ----------

# MAGIC %pip install mlflow[gateway]
# MAGIC dbutils.library.restartPython()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Setup widgets

# COMMAND ----------

# Text widgets: the table selection pattern to analyze and the name of the gateway route to query.
# NOTE(review): the widget key "moasicml_route_name" is misspelled ("mosaicml"); it is kept
# as-is because the cell that reads the widget uses the same key.
dbutils.widgets.text("from_tables", "discoverx_sample.*.*", "from tables")
dbutils.widgets.text("moasicml_route_name", "discoverx-mosaicml-llama2-70b-completions", "mosaicml route name")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Import required libs and initialize variables

# COMMAND ----------

import pandas as pd
from pyspark.sql.functions import (
pandas_udf,
col,
concat,
lit,
explode,
count,
avg,
min,
max,
sum,
collect_set,
concat_ws,
)
from pyspark.sql.types import ArrayType, StringType, StructType, FloatType, StructField
from typing import Iterator

# COMMAND ----------

# Sample size (in rows) handed to unpivot_string_columns
sample_size = 100

# Widget-supplied settings: the gateway route to query and the tables to scan
moasicml_route_name = dbutils.widgets.get("moasicml_route_name")
from_tables = dbutils.widgets.get("from_tables")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Initialize discoverx

# COMMAND ----------

from discoverx import DX

# DiscoverX entry point; used below to select the tables to scan via `from_tables`.
dx = DX()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Transform all sampled tables

# COMMAND ----------

# Select the tables matching `from_tables` and unpivot their string columns (using `sample_size`)
# into a single dataset; its `string_value` column is what the prediction step reads.
unpivoted_df = (
    dx.from_tables(from_tables)
    .unpivot_string_columns(sample_size=sample_size)
    .apply()
    .localCheckpoint()  # Checkpointing to reduce the query plan size
)

# COMMAND ----------

# Preview the unpivoted data before sending it to the model.
display(unpivoted_df)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Define UDF to use the MosaicML model

# COMMAND ----------

import mlflow
from mlflow import gateway


@pandas_udf(StringType())
def predict_value_udf(s: pd.Series) -> pd.Series:
    """Ask the MosaicML gateway route whether each text is about an acquisition/merger.

    Each element of ``s`` is embedded in an ``[INST]`` / ``<<SYS>>`` prompt that asks for a
    YES or NO answer and is sent, one request per element, to the MLflow gateway route
    named by the notebook variable ``moasicml_route_name``.

    The ``pd.Series -> pd.Series`` type hints declare this as a Series-to-Series pandas UDF
    explicitly instead of relying on the deprecated inference used when hints are missing.

    Args:
        s: Batch of text values to classify.

    Returns:
        A Series holding, for every input element, the ``text`` of the first candidate
        in the route's response.
    """

    def predict_value(text):
        # NOTE(review): missing values are presumably interpolated as "None" and still
        # sent to the route (one billable request each) - confirm and filter upstream if so.
        data = {
            "prompt": f""" [INST]
<<SYS>>
Reply with either YES or NO
<</SYS>>
Is this news article related to aquisition/merger ?
News Article: {text}
[/INST]
"""
        }
        r = mlflow.gateway.query(route=moasicml_route_name, data=data)
        # Only the first candidate completion is kept.
        return r["candidates"][0]["text"]

    return s.apply(predict_value)


# COMMAND ----------

# MAGIC %md
# MAGIC ### Run Predictions

# COMMAND ----------

# Add the model's answer for every `string_value` as a new column.
# NOTE(review): the column name "is_realted_to_aquisition" is misspelled
# ("is_related_to_acquisition"); left unchanged here because renaming it alters the output schema.
df_with_prediction = unpivoted_df.withColumn("is_realted_to_aquisition", predict_value_udf(col("string_value")))

# COMMAND ----------

# Show the sampled values together with the prediction column.
display(df_with_prediction)
Loading

0 comments on commit c73465d

Please sign in to comment.