Add CustomAttributes to SageMaker #405

Closed
21 changes: 21 additions & 0 deletions builds/README.md
@@ -0,0 +1,21 @@
# Builds

## How to use
There are two different packages, *clean* and *enhanced*.

1. *clean* - lets the user input a question to the model
2. *enhanced* - lets the user tweak the context and ask a question

### Manual deployment

1. Open jupyter.syapse.com
2. Drag the *clean* or *enhanced* folder into the Jupyter environment.
3. In Jupyter, open your folder.
4. Open the notebook file and run each cell.


### Create a new deployment

1. Make your code changes in `packages/jupyter-ai-magics/jupyter_ai_magics` (for example `providers.py`).
2. Run `python -m build` to create a build.
3. Copy the resulting wheel from `dist/` next to the notebook so the install cell can pick it up.
70 changes: 70 additions & 0 deletions builds/clean/jupyter_ai_clean.ipynb
@@ -0,0 +1,70 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "dd9c343d-8f2b-482f-a788-524d6376b14a",
"metadata": {},
"outputs": [],
"source": [
"!pip install s3fs && pip install jupyter_ai[all] && pip install ./jupyter_ai_magics-2.2.17-py3-none-any.whl --no-cache-dir --force-reinstall"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "dd77a9de-45e7-44a5-8920-675b53bddf62",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import boto3\n",
"from jupyter_ai_magics.utils import explode_columns, load_make_plots\n",
"region=\"us-west-2\"\n",
"# load jupyter_ai\n",
"%load_ext jupyter_ai\n",
"# Register alias\n",
"%ai register raydar-ai sagemaker-endpoint:meta-textgeneration-llama-2-7b-2023-09-20-19-24-48-464\n",
"# load Breast ADS\n",
"breast_ads_orig = pd.read_parquet(\n",
" \"s3://syapse-deidentify-emr-data/deidentify/ads/breast.parquet\",\n",
" columns=['birth_date_year', 'has_family_history_cancer', 'marital_status', 'brca_reported_status', 'deceased_date', 'diagnosis_date', 'diagnosis_date_year', 'er_status_diagnosis', 'ethnicity', 'grade', 'her2_status_diagnosis', 'histology_group', 'icdo3_topography', 'insurance_status_dx', 'menopausal_status_diagnosis', 'packsperday', 'patientid', 'pr_status_diagnosis', 'prioritized_stage_group_dx', 'race', 'region', 'sex', 'smoking_pack_years', 'smoking_years', 'vital_status', 'systemic_therapy', 'her2_biomarker_data', 'hrd_biomarker_data'])\n",
"breast_ads_orig['patientId'] = breast_ads_orig['patientid']\n",
"make_plots = load_make_plots(breast_ads_orig)\n",
"breast_ads = explode_columns(breast_ads_orig, ['systemic_therapy', 'her2_biomarker_data', 'hrd_biomarker_data'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620d974e-eba1-47e8-a505-1a1f0656cc62",
"metadata": {},
"outputs": [],
"source": [
"%%ai raydar-ai --region-name=us-west-2 --request-schema={\"inputs\":\"<prompt>\"} --response-path=[0].generation -f code\n",
"Find breast cacer patients that are HER2 negative and create plots"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
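For reference, the `%%ai` cell above is roughly equivalent to calling the SageMaker runtime directly. The sketch below is illustrative only (the endpoint name and prompt are copied from the notebook, everything else is assumed); the `CustomAttributes` value is what this PR wires in through `endpoint_kwargs` so the Llama 2 JumpStart endpoint accepts its EULA.

```python
import json

import boto3

# Values taken from the notebook cell above; treat them as placeholders.
ENDPOINT = "meta-textgeneration-llama-2-7b-2023-09-20-19-24-48-464"
PROMPT = "Find breast cancer patients that are HER2 negative and create plots"

runtime = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT,
    ContentType="application/json",
    # Matches --request-schema={"inputs":"<prompt>"} plus the parameters added in providers.py.
    Body=json.dumps({"inputs": PROMPT, "parameters": {"max_new_tokens": 100}}),
    # The attribute this PR adds via endpoint_kwargs.
    CustomAttributes="accept_eula=true",
)
# Matches --response-path=[0].generation: take "generation" from the first array element.
print(json.loads(response["Body"].read())[0]["generation"])
```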
83 changes: 83 additions & 0 deletions builds/enhanced/jupyter_ai.ipynb
@@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8fd5be27-6d1a-4138-bf3e-c0df902a4462",
"metadata": {},
"outputs": [],
"source": [
"!pip install s3fs && jupyter_ai[all] && pip install ./jupyter_ai_magics-2.2.10-py3-none-any.whl --no-cache-dir"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3343f17d-c6a9-4a92-8685-dba9019854ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import boto3\n",
"region=\"us-west-2\"\n",
"# load jupyter_ai\n",
"%load_ext jupyter_ai\n",
"# Register alias\n",
"%ai register raydar-ai sagemaker-endpoint:meta-textgeneration-llama-2-7b-2023-09-19-20-22-39-442\n",
"# load Breast ADS\n",
"breast_ads = pd.read_parquet(\"s3://syapse-deidentify-emr-data/deidentify/ads/breast.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec07f295-25bd-4903-894e-30d9f6ccd806",
"metadata": {},
"outputs": [],
"source": [
"%%ai raydar-ai --region-name=us-west-2 --request-schema={\"inputs\":\"<prompt>\"} --response-path=[0].generation -f code\n",
"Below is an instruction that describes a task, paired with an input that provides further context.\n",
"Write a response that appropriately completes the request.\n",
"### Instruction:\n",
" find me patients who are female\n",
"### Input:\n",
" This is data about patients with breast cancer.\n",
" breast_ads is a pandas dataframe with a list of columns ['birth_date', 'deceased_date', 'sex', 'race', 'ethnicity', 'vital_status','diagnosis_date', 'er_status_diagnosis', 'pr_status_diagnosis','her2_status_diagnosis', 'brca_reported_status']\n",
" \n",
" Below is the description of each column\n",
" birth_date column Patient's year of birth.\n",
" deceased_date column is the date of death of a patient if the patient is dead. It is empty if the patient is alive\n",
" sex column is Patient's sex as recorded in EMR at diagnosis. \n",
" race column is Primary race as recorded in patient EMR at diagnosis. \n",
" ethnicity column is Ethnicity as recorded in patient EMR at diagnosis. \n",
" vital_status column is Patient's vital status based on the composite mortality score. If no evidence of death, patient will be classified as Alive. \n",
" diagnosis_date column is Date (year, month, day) of initial breast cancer diagnosis.\n",
" er_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. Categorized as Negative, Low positive, Positive, Equivocal, Unknown. \n",
" pr_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. \n",
" her2_status_diagnosis column Results from test at the time of initial diagnosis and up to 45 days after. \n",
" brca_reported_status column is Germline or somatic BRCA1/2 mutation (ever). Categories are positive, negative and unknown"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
29 changes: 28 additions & 1 deletion packages/jupyter-ai-magics/jupyter_ai_magics/providers.py
@@ -505,6 +505,28 @@ class AzureChatOpenAIProvider(BaseProvider, AzureChatOpenAI):
class JsonContentHandler(LLMContentHandler):
content_type = "application/json"
accepts = "application/json"
context = """This is data about patients with breast cancer.
breast_ads is a pandas dataframe with a list of columns ['birth_date', 'deceased_date', 'sex', 'race', 'ethnicity', 'vital_status','diagnosis_date', 'er_status_diagnosis', 'pr_status_diagnosis','her2_status_diagnosis', 'brca_reported_status']

Below is the description of each column
birth_date column is Patient's year of birth.
deceased_date column is the date of death of a patient if the patient is dead. It is empty if the patient is alive
sex column is Patient's sex as recorded in EMR at diagnosis.
race column is Primary race as recorded in patient EMR at diagnosis.
ethnicity column is Ethnicity as recorded in patient EMR at diagnosis.
vital_status column is Patient's vital status based on the composite mortality score. If no evidence of death, patient will be classified as Alive.
diagnosis_date column is Date (year, month, day) of initial breast cancer diagnosis.
er_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. Categorized as Negative, Low positive, Positive, Equivocal, Unknown.
pr_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after.
her2_status_diagnosis column is the Results from test at the time of initial diagnosis and up to 45 days after.
brca_reported_status column is Germline or somatic BRCA1/2 mutation (ever). Categories are positive, negative and unknown"""
template = {
"prompt": "Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n",
"completion": " {response}",
}
input_output_demarkation_key = "\n\n### Response:\n"

def __init__(self, request_schema, response_path):
self.request_schema = json.loads(request_schema)
@@ -523,7 +545,11 @@ def replace_values(self, old_val, new_val, d: Dict[str, Any]):

def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
request_obj = copy.deepcopy(self.request_schema)
self.replace_values("<prompt>", prompt, request_obj)
input_prompt = self.template["prompt"].format(
instruction=prompt, context=self.context
) + self.input_output_demarkation_key
self.replace_values("<prompt>", input_prompt, request_obj)
request_obj['parameters'] = {"max_new_tokens": 100}
request = json.dumps(request_obj).encode("utf-8")
return request

@@ -566,6 +592,7 @@ def __init__(self, *args, **kwargs):
content_handler = JsonContentHandler(
request_schema=request_schema, response_path=response_path
)
kwargs["endpoint_kwargs"] = {"CustomAttributes": "accept_eula=true"}
super().__init__(*args, **kwargs, content_handler=content_handler)

async def _acall(self, *args, **kwargs) -> Coroutine[Any, Any, str]:
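To make the new prompt handling concrete, here is a minimal standalone sketch of what `transform_input` now produces for the `--request-schema={"inputs":"<prompt>"}` used in the notebooks. The `build_request` helper and the shortened context string are illustrative stand-ins, not the provider's actual API; the real handler walks the schema recursively via `replace_values`.

```python
import copy
import json

# Abbreviated stand-in for JsonContentHandler.context from the diff above.
CONTEXT = "This is data about patients with breast cancer. breast_ads is a pandas dataframe ..."
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n"
)
DEMARCATION = "\n\n### Response:\n"

def build_request(prompt: str, request_schema: str) -> bytes:
    """Illustrative equivalent of transform_input for a flat {"inputs": "<prompt>"} schema."""
    request_obj = copy.deepcopy(json.loads(request_schema))
    # Wrap the raw user prompt in the instruction/context template, then append the response marker.
    input_prompt = PROMPT_TEMPLATE.format(instruction=prompt, context=CONTEXT) + DEMARCATION
    # Simplified placeholder swap; the real code recurses through nested dicts and lists.
    for key, value in request_obj.items():
        if value == "<prompt>":
            request_obj[key] = input_prompt
    request_obj["parameters"] = {"max_new_tokens": 100}
    return json.dumps(request_obj).encode("utf-8")

payload = build_request("find me patients who are female", '{"inputs": "<prompt>"}')
print(payload.decode()[:120])
```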
91 changes: 91 additions & 0 deletions packages/jupyter-ai-magics/jupyter_ai_magics/utils.py
@@ -1,18 +1,109 @@
import logging
from typing import Dict, Optional, Tuple, Type, Union
from pandas import json_normalize

from importlib_metadata import entry_points
from jupyter_ai_magics.aliases import MODEL_ID_ALIASES
from jupyter_ai_magics.embedding_providers import BaseEmbeddingsProvider
from jupyter_ai_magics.providers import BaseProvider

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd


Logger = Union[logging.Logger, logging.LoggerAdapter]
LmProvidersDict = Dict[str, BaseProvider]
EmProvidersDict = Dict[str, BaseEmbeddingsProvider]
AnyProvider = Union[BaseProvider, BaseEmbeddingsProvider]
ProviderDict = Dict[str, AnyProvider]


def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
# color palette: 5 colors, so at most 5 category columns
colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
labelList = []
colorNumList = []
for catCol in cat_cols:
labelListTemp = list(set(df[catCol].values))
colorNumList.append(len(labelListTemp))
labelList = labelList + labelListTemp

# remove duplicates from labelList
labelList = list(dict.fromkeys(labelList))

# define colors based on number of levels
colorList = []
for idx, colorNum in enumerate(colorNumList):
colorList = colorList + [colorPalette[idx]]*colorNum

# transform df into a source-target pair
for i in range(len(cat_cols)-1):
if i==0:
sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
sourceTargetDf.columns = ['source','target','count']
else:
tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
tempDf.columns = ['source','target','count']
sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()

# add index for source-target pair
sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))

# creating the sankey diagram
data = dict(
type='sankey',
node = dict(
pad = 15,
thickness = 20,
line = dict(
color = "black",
width = 0.5
),
label = labelList,
color = colorList
),
link = dict(
source = sourceTargetDf['sourceID'],
target = sourceTargetDf['targetID'],
value = sourceTargetDf['count']
)
)

layout = dict(
title = title,
font = dict(
size = 10
)
)

fig = dict(data=[data], layout=layout)
return go.Figure(fig)

def load_make_plots(breast_ads_orig):
def make_plots(patients):
breast_ads_plot = breast_ads_orig[breast_ads_orig['patientId'].isin( patients)]
fig = px.histogram(breast_ads_plot, x="diagnosis_date")
fig.show()
breast_ads_plot['count']=1
fig = genSankey(breast_ads_plot, cat_cols=['region','race','icdo3_topography' ],value_cols='count', title="Population Characteristics")
fig.show()
return make_plots

def explode_column(df, col_name):
df2 = df.explode(col_name)
df3 = json_normalize(df2[col_name].tolist())
return df2.join(df3)

def explode_columns(df, columns):
new_df = df
for col_name in columns:
new_df = explode_column(new_df, col_name)
return new_df


def get_lm_providers(log: Optional[Logger] = None) -> LmProvidersDict:
if not log:
log = logging.getLogger()
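A quick illustration of what `explode_column` does with the nested biomarker columns: each list-of-dicts value is exploded to one row per entry and the dicts are flattened into regular columns joined back on the index. The sample frame below is made up, assuming the ADS columns have roughly this shape (one dict per list here so the index join stays one-to-one).

```python
import pandas as pd
from pandas import json_normalize

def explode_column(df, col_name):
    # Same logic as the function added in utils.py above.
    df2 = df.explode(col_name)
    df3 = json_normalize(df2[col_name].tolist())
    return df2.join(df3)

# Made-up sample data; the real ADS contents are only assumed to look like this.
df = pd.DataFrame({
    "patientId": ["p1", "p2"],
    "her2_biomarker_data": [
        [{"result": "Negative", "test_date": "2020-01-05"}],
        [{"result": "Positive", "test_date": "2021-03-10"}],
    ],
})

flat = explode_column(df, "her2_biomarker_data")
print(flat[["patientId", "result", "test_date"]])
```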
2 changes: 1 addition & 1 deletion packages/jupyter-ai-magics/package.json
@@ -1,6 +1,6 @@
{
"name": "@jupyter-ai/magics",
"version": "2.2.0",
"version": "2.2.17",
"description": "Jupyter AI magics Python package. Not published on NPM.",
"private": true,
"homepage": "https://github.com/jupyterlab/jupyter-ai",