Add CustomAttributes to SageMaker #405

Closed
21 changes: 21 additions & 0 deletions builds/README.md
@@ -0,0 +1,21 @@
# Builds

## How to use
There are two different packages, *clean* and *enhanced*.

1. *clean* - lets the user input a question to the model
2. *enhanced* - lets the user tweak the context and ask a question

### Manual deployment

1. Open jupyter.syapse.com
2. Drag the *clean* or *enhanced* folder into the Jupyter environment.
3. In Jupyter, open your folder.
4. Open the notebook file and run each cell.


### Create a new deployment

1. Make your code changes in `packages/jupyter-ai-magics/jupyter_ai_magics` (for example `providers.py`).
2. Run `python -m build` to create a build.
3. Copy the resulting wheel from `dist/` next to the notebook so the install cell can pick it up.
70 changes: 70 additions & 0 deletions builds/clean/jupyter_ai_clean.ipynb
@@ -0,0 +1,70 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "dd9c343d-8f2b-482f-a788-524d6376b14a",
"metadata": {},
"outputs": [],
"source": [
"!pip install s3fs && pip install jupyter_ai[all] && pip install ./jupyter_ai_magics-2.2.17-py3-none-any.whl --no-cache-dir --force-reinstall"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "dd77a9de-45e7-44a5-8920-675b53bddf62",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import boto3\n",
"from jupyter_ai_magics.utils import explode_columns, load_make_plots\n",
"region=\"us-west-2\"\n",
"# load jupyter_ai\n",
"%load_ext jupyter_ai\n",
"# Register alias\n",
"%ai register raydar-ai sagemaker-endpoint:meta-textgeneration-llama-2-7b-2023-09-20-19-24-48-464\n",
"# load Breast ADS\n",
"breast_ads_orig = pd.read_parquet(\n",
" \"s3://syapse-deidentify-emr-data/deidentify/ads/breast.parquet\",\n",
" columns=['birth_date_year', 'has_family_history_cancer', 'marital_status', 'brca_reported_status', 'deceased_date', 'diagnosis_date', 'diagnosis_date_year', 'er_status_diagnosis', 'ethnicity', 'grade', 'her2_status_diagnosis', 'histology_group', 'icdo3_topography', 'insurance_status_dx', 'menopausal_status_diagnosis', 'packsperday', 'patientid', 'pr_status_diagnosis', 'prioritized_stage_group_dx', 'race', 'region', 'sex', 'smoking_pack_years', 'smoking_years', 'vital_status', 'systemic_therapy', 'her2_biomarker_data', 'hrd_biomarker_data'])\n",
"breast_ads_orig['patientId'] = breast_ads_orig['patientid']\n",
"make_plots = load_make_plots(breast_ads_orig)\n",
"breast_ads = explode_columns(breast_ads_orig, ['systemic_therapy', 'her2_biomarker_data', 'hrd_biomarker_data'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620d974e-eba1-47e8-a505-1a1f0656cc62",
"metadata": {},
"outputs": [],
"source": [
"%%ai raydar-ai --region-name=us-west-2 --request-schema={\"inputs\":\"<prompt>\"} --response-path=[0].generation -f code\n",
"Find breast cacer patients that are HER2 negative and create plots"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
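For reference, the `%%ai` cell above is roughly equivalent to calling the SageMaker runtime directly. The sketch below is illustrative only (the endpoint name and prompt are copied from the notebook, everything else is assumed); the `CustomAttributes` value is what this PR wires in through `endpoint_kwargs` so the Llama 2 JumpStart endpoint accepts its EULA.

```python
import json

import boto3

# Values taken from the notebook cell above; treat them as placeholders.
ENDPOINT = "meta-textgeneration-llama-2-7b-2023-09-20-19-24-48-464"
PROMPT = "Find breast cancer patients that are HER2 negative and create plots"

runtime = boto3.client("sagemaker-runtime", region_name="us-west-2")
response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT,
    ContentType="application/json",
    # Matches --request-schema={"inputs":"<prompt>"} plus the parameters added in providers.py.
    Body=json.dumps({"inputs": PROMPT, "parameters": {"max_new_tokens": 100}}),
    # The attribute this PR adds via endpoint_kwargs.
    CustomAttributes="accept_eula=true",
)
# Matches --response-path=[0].generation: take "generation" from the first array element.
print(json.loads(response["Body"].read())[0]["generation"])
```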
83 changes: 83 additions & 0 deletions builds/enhanced/jupyter_ai.ipynb
@@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8fd5be27-6d1a-4138-bf3e-c0df902a4462",
"metadata": {},
"outputs": [],
"source": [
"!pip install s3fs && jupyter_ai[all] && pip install ./jupyter_ai_magics-2.2.10-py3-none-any.whl --no-cache-dir"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3343f17d-c6a9-4a92-8685-dba9019854ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import boto3\n",
"region=\"us-west-2\"\n",
"# load jupyter_ai\n",
"%load_ext jupyter_ai\n",
"# Register alias\n",
"%ai register raydar-ai sagemaker-endpoint:meta-textgeneration-llama-2-7b-2023-09-19-20-22-39-442\n",
"# load Breast ADS\n",
"breast_ads = pd.read_parquet(\"s3://syapse-deidentify-emr-data/deidentify/ads/breast.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec07f295-25bd-4903-894e-30d9f6ccd806",
"metadata": {},
"outputs": [],
"source": [
"%%ai raydar-ai --region-name=us-west-2 --request-schema={\"inputs\":\"<prompt>\"} --response-path=[0].generation -f code\n",
"Below is an instruction that describes a task, paired with an input that provides further context.\n",
"Write a response that appropriately completes the request.\n",
"### Instruction:\n",
" find me patients who are female\n",
"### Input:\n",
" This is data about patients with breast cancer.\n",
" breast_ads is a pandas dataframe with a list of columns ['birth_date', 'deceased_date', 'sex', 'race', 'ethnicity', 'vital_status','diagnosis_date', 'er_status_diagnosis', 'pr_status_diagnosis','her2_status_diagnosis', 'brca_reported_status']\n",
" \n",
" Below is the description of each column\n",
" birth_date column Patient's year of birth.\n",
" deceased_date column is the date of death of a patient if the patient is dead. It is empty if the patient is alive\n",
" sex column is Patient's sex as recorded in EMR at diagnosis. \n",
" race column is Primary race as recorded in patient EMR at diagnosis. \n",
" ethnicity column is Ethnicity as recorded in patient EMR at diagnosis. \n",
" vital_status column is Patient's vital status based on the composite mortality score. If no evidence of death, patient will be classified as Alive. \n",
" diagnosis_date column is Date (year, month, day) of initial breast cancer diagnosis.\n",
" er_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. Categorized as Negative, Low positive, Positive, Equivocal, Unknown. \n",
" pr_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. \n",
" her2_status_diagnosis column Results from test at the time of initial diagnosis and up to 45 days after. \n",
" brca_reported_status column is Germline or somatic BRCA1/2 mutation (ever). Categories are positive, negative and unknown"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
29 changes: 28 additions & 1 deletion packages/jupyter-ai-magics/jupyter_ai_magics/providers.py
@@ -505,6 +505,28 @@ class AzureChatOpenAIProvider(BaseProvider, AzureChatOpenAI):
class JsonContentHandler(LLMContentHandler):
content_type = "application/json"
accepts = "application/json"
context = """This is data about patients with breast cancer.
breast_ads is a pandas dataframe with a list of columns ['birth_date', 'deceased_date', 'sex', 'race', 'ethnicity', 'vital_status','diagnosis_date', 'er_status_diagnosis', 'pr_status_diagnosis','her2_status_diagnosis', 'brca_reported_status']

Below is the description of each column
birth_date column is Patient's year of birth.
deceased_date column is the date of death of a patient if the patient is dead. It is empty if the patient is alive
sex column is Patient's sex as recorded in EMR at diagnosis.
race column is Primary race as recorded in patient EMR at diagnosis.
ethnicity column is Ethnicity as recorded in patient EMR at diagnosis.
vital_status column is Patient's vital status based on the composite mortality score. If no evidence of death, patient will be classified as Alive.
diagnosis_date column is Date (year, month, day) of initial breast cancer diagnosis.
er_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after. Categorized as Negative, Low positive, Positive, Equivocal, Unknown.
pr_status_diagnosis column is the Results from test closest to the date of initial diagnosis and up to 14 days after.
her2_status_diagnosis column is the Results from test at the time of initial diagnosis and up to 45 days after.
brca_reported_status column is Germline or somatic BRCA1/2 mutation (ever). Categories are positive, negative and unknown"""
template = {
"prompt": "Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n",
"completion": " {response}",
}
input_output_demarkation_key = "\n\n### Response:\n"

def __init__(self, request_schema, response_path):
self.request_schema = json.loads(request_schema)
@@ -523,7 +545,11 @@ def replace_values(self, old_val, new_val, d: Dict[str, Any]):

def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
request_obj = copy.deepcopy(self.request_schema)
self.replace_values("<prompt>", prompt, request_obj)
input_prompt = self.template["prompt"].format(
instruction=prompt, context=self.context
) + self.input_output_demarkation_key
self.replace_values("<prompt>", input_prompt, request_obj)
request_obj['parameters'] = {"max_new_tokens": 100}
request = json.dumps(request_obj).encode("utf-8")
return request

@@ -566,6 +592,7 @@ def __init__(self, *args, **kwargs):
content_handler = JsonContentHandler(
request_schema=request_schema, response_path=response_path
)
kwargs["endpoint_kwargs"] = {"CustomAttributes": "accept_eula=true"}
super().__init__(*args, **kwargs, content_handler=content_handler)

async def _acall(self, *args, **kwargs) -> Coroutine[Any, Any, str]:
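To make the new prompt handling concrete, here is a minimal standalone sketch of what `transform_input` now produces for the `--request-schema={"inputs":"<prompt>"}` used in the notebooks. The `build_request` helper and the shortened context string are illustrative stand-ins, not the provider's actual API; the real handler walks the schema recursively via `replace_values`.

```python
import copy
import json

# Abbreviated stand-in for JsonContentHandler.context from the diff above.
CONTEXT = "This is data about patients with breast cancer. breast_ads is a pandas dataframe ..."
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n"
)
DEMARCATION = "\n\n### Response:\n"

def build_request(prompt: str, request_schema: str) -> bytes:
    """Illustrative equivalent of transform_input for a flat {"inputs": "<prompt>"} schema."""
    request_obj = copy.deepcopy(json.loads(request_schema))
    # Wrap the raw user prompt in the instruction/context template, then append the response marker.
    input_prompt = PROMPT_TEMPLATE.format(instruction=prompt, context=CONTEXT) + DEMARCATION
    # Simplified placeholder swap; the real code recurses through nested dicts and lists.
    for key, value in request_obj.items():
        if value == "<prompt>":
            request_obj[key] = input_prompt
    request_obj["parameters"] = {"max_new_tokens": 100}
    return json.dumps(request_obj).encode("utf-8")

payload = build_request("find me patients who are female", '{"inputs": "<prompt>"}')
print(payload.decode()[:120])
```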
91 changes: 91 additions & 0 deletions packages/jupyter-ai-magics/jupyter_ai_magics/utils.py
@@ -1,18 +1,109 @@
import logging
from typing import Dict, Optional, Tuple, Type, Union
from pandas import json_normalize

from importlib_metadata import entry_points
from jupyter_ai_magics.aliases import MODEL_ID_ALIASES
from jupyter_ai_magics.embedding_providers import BaseEmbeddingsProvider
from jupyter_ai_magics.providers import BaseProvider

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd


Logger = Union[logging.Logger, logging.LoggerAdapter]
LmProvidersDict = Dict[str, BaseProvider]
EmProvidersDict = Dict[str, BaseEmbeddingsProvider]
AnyProvider = Union[BaseProvider, BaseEmbeddingsProvider]
ProviderDict = Dict[str, AnyProvider]


def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
# color palette: 5 colors, so at most 5 category columns
colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
labelList = []
colorNumList = []
for catCol in cat_cols:
labelListTemp = list(set(df[catCol].values))
colorNumList.append(len(labelListTemp))
labelList = labelList + labelListTemp

# remove duplicates from labelList
labelList = list(dict.fromkeys(labelList))

# define colors based on number of levels
colorList = []
for idx, colorNum in enumerate(colorNumList):
colorList = colorList + [colorPalette[idx]]*colorNum

# transform df into a source-target pair
for i in range(len(cat_cols)-1):
if i==0:
sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
sourceTargetDf.columns = ['source','target','count']
else:
tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
tempDf.columns = ['source','target','count']
sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()

# add index for source-target pair
sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))

# creating the sankey diagram
data = dict(
type='sankey',
node = dict(
pad = 15,
thickness = 20,
line = dict(
color = "black",
width = 0.5
),
label = labelList,
color = colorList
),
link = dict(
source = sourceTargetDf['sourceID'],
target = sourceTargetDf['targetID'],
value = sourceTargetDf['count']
)
)

layout = dict(
title = title,
font = dict(
size = 10
)
)

fig = dict(data=[data], layout=layout)
return go.Figure(fig)

def load_make_plots(breast_ads_orig):
def make_plots(patients):
breast_ads_plot = breast_ads_orig[breast_ads_orig['patientId'].isin( patients)]
fig = px.histogram(breast_ads_plot, x="diagnosis_date")
fig.show()
breast_ads_plot['count']=1
fig = genSankey(breast_ads_plot, cat_cols=['region','race','icdo3_topography' ],value_cols='count', title="Population Characteristics")
fig.show()
return make_plots

def explode_column(df, col_name):
df2 = df.explode(col_name)
df3 = json_normalize(df2[col_name].tolist())
return df2.join(df3)

def explode_columns(df, columns):
new_df = df
for col_name in columns:
new_df = explode_column(new_df, col_name)
return new_df


def get_lm_providers(log: Optional[Logger] = None) -> LmProvidersDict:
if not log:
log = logging.getLogger()
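A quick illustration of what `explode_column` does with the nested biomarker columns: each list-of-dicts value is exploded to one row per entry and the dicts are flattened into regular columns joined back on the index. The sample frame below is made up, assuming the ADS columns have roughly this shape (one dict per list here so the index join stays one-to-one).

```python
import pandas as pd
from pandas import json_normalize

def explode_column(df, col_name):
    # Same logic as the function added in utils.py above.
    df2 = df.explode(col_name)
    df3 = json_normalize(df2[col_name].tolist())
    return df2.join(df3)

# Made-up sample data; the real ADS contents are only assumed to look like this.
df = pd.DataFrame({
    "patientId": ["p1", "p2"],
    "her2_biomarker_data": [
        [{"result": "Negative", "test_date": "2020-01-05"}],
        [{"result": "Positive", "test_date": "2021-03-10"}],
    ],
})

flat = explode_column(df, "her2_biomarker_data")
print(flat[["patientId", "result", "test_date"]])
```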
2 changes: 1 addition & 1 deletion packages/jupyter-ai-magics/package.json
@@ -1,6 +1,6 @@
{
"name": "@jupyter-ai/magics",
"version": "2.2.0",
"version": "2.2.17",
"description": "Jupyter AI magics Python package. Not published on NPM.",
"private": true,
"homepage": "https://github.com/jupyterlab/jupyter-ai",