biopragmatics · cthoyt · Sep 9, 2023 · Sep 9, 2023 · Sep 9, 2023 · Sep 11, 2023
diff --git a/notebooks/Data Science Demo.ipynb b/notebooks/Data Science Demo.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "cfb169ac-d6e7-4132-9ffc-a14edf8a918f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import curies\n",
+    "import pandas as pd\n",
+    "import itertools as itt\n",
+    "import pystow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3091dc17-b60d-4cc1-94a9-c523b3cce4e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 185 ms, sys: 108 ms, total: 293 ms\n",
+      "Wall time: 917 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "obo_converter = curies.get_obo_converter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d4e138e4-31f5-4c0d-ba0f-9849586af00c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 6.73 s, sys: 63 ms, total: 6.79 s\n",
+      "Wall time: 6.8 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "bioregistry_converter = curies.get_bioregistry_converter()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f94b0791-ab75-481b-9e83-8990f0fbc4f1",
+   "metadata": {},
+   "source": [
+    "# Disease Ontology SSSOM Demo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5301bafc-15eb-45bc-adf6-6281d6da1b3e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([['DOID:8717', 'oboInOwl:hasDbXref', 'NCI:C50706'],\n",
+       "       ['DOID:8717', 'oboInOwl:hasDbXref', 'MESH:D003668'],\n",
+       "       ['DOID:8717', 'oboInOwl:hasDbXref', 'ICD9CM:707.0'],\n",
+       "       ['DOID:8717', 'oboInOwl:hasDbXref',\n",
+       "        'SNOMEDCT_US_2021_09_01:28103007'],\n",
+       "       ['DOID:8717', 'oboInOwl:hasDbXref', 'UMLS_CUI:C0011127']],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Pin the download to one commit so every run reads the same mappings file\n",
+    "commit = \"faca4fc335f9a61902b9c47a1facd52a0d3d2f8b\"\n",
+    "url = f\"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv\"\n",
+    "# The comment marker is forwarded to the CSV reader; presumably it skips the SSSOM metadata header (TODO confirm)\n",
+    "df = pystow.ensure_csv(\"tmp\", url=url, read_csv_kwargs={\"comment\": \"#\"})\n",
+    "# Preview subject/predicate/object for the first five rows as a plain array\n",
+    "df[[\"subject_id\", \"predicate_id\", \"object_id\"]].head().to_numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "53ae14ad-1665-472f-a849-f6e2fa95fde4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "## Summary\n",
+       "\n",
+       "Standardization was not necessary for 2 (0.0%), resulted in 0 updates (0.0%), and 34,522 failures (100.0%)  in column `object_id`. Here's a breakdown of the prefixes that weren't possible to standardize:\n",
+       "\n",
+       "| prefix                 |   count | examples                                                                                                                                                               |\n",
+       "|:-----------------------|--------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
+       "| EFO                    |     131 | EFO:0000274, EFO:0001071, EFO:0001075, EFO:0001422, EFO:0004705                                                                                                        |\n",
+       "| GARD                   |    2030 | GARD:2562, GARD:5721, GARD:6291, GARD:7065, GARD:8378                                                                                                                  |\n",
+       "| ICD10CM                |    3666 | ICD10CM:A21.0, ICD10CM:C03, ICD10CM:K72, ICD10CM:K82.4, ICD10CM:N30.0                                                                                                  |\n",
+       "| ICD9CM                 |    2266 | ICD9CM:214.4, ICD9CM:232.4, ICD9CM:377.75, ICD9CM:428.2, ICD9CM:745.6                                                                                                  |\n",
+       "| ICDO                   |     361 | ICDO:8300/0, ICDO:8840/3, ICDO:9442/1, ICDO:9530/0, ICDO:9590/3                                                                                                        |\n",
+       "| KEGG                   |      41 | KEGG:05016, KEGG:05133, KEGG:05142, KEGG:05222, KEGG:05414                                                                                                             |\n",
+       "| MEDDRA                 |      41 | MEDDRA:10001229, MEDDRA:10015487, MEDDRA:10021312, MEDDRA:10059200, MEDDRA:10060740                                                                                    |\n",
+       "| MESH                   |    3847 | MESH:D002128, MESH:D005141, MESH:D009198, MESH:D011040, MESH:D017240                                                                                                   |\n",
+       "| NCI                    |    4788 | NCI:C26913, NCI:C27390, NCI:C27871, NCI:C40284, NCI:C6081                                                                                                              |\n",
+       "| OMIM                   |    5539 | OMIM:209700, OMIM:222300, OMIM:530000, OMIM:613021, OMIM:618224                                                                                                        |\n",
+       "| ORDO                   |    2023 | ORDO:139441, ORDO:2510, ORDO:255229, ORDO:420702, ORDO:48652                                                                                                           |\n",
+       "| SNOMEDCT_US_2020_03_01 |       6 | SNOMEDCT_US_2020_03_01:236818008, SNOMEDCT_US_2020_03_01:778024005, SNOMEDCT_US_2020_03_01:8757006                                                                     |\n",
+       "| SNOMEDCT_US_2020_09_01 |       1 | SNOMEDCT_US_2020_09_01:1112003                                                                                                                                         |\n",
+       "| SNOMEDCT_US_2021_07_31 |      10 | SNOMEDCT_US_2021_07_31:268180007, SNOMEDCT_US_2021_07_31:703536004, SNOMEDCT_US_2021_07_31:721311006, SNOMEDCT_US_2021_07_31:75931002                                  |\n",
+       "| SNOMEDCT_US_2021_09_01 |    5088 | SNOMEDCT_US_2021_09_01:111359004, SNOMEDCT_US_2021_09_01:155748004, SNOMEDCT_US_2021_09_01:238113006, SNOMEDCT_US_2021_09_01:38804009, SNOMEDCT_US_2021_09_01:92585006 |\n",
+       "| UMLS_CUI               |    6890 | UMLS_CUI:C0031347, UMLS_CUI:C0206724, UMLS_CUI:C0276007, UMLS_CUI:C0392492, UMLS_CUI:C1515285                                                                          |\n",
+       "\n",
+       "## Suggestions\n",
+       "\n",
+       "- NCI Suggestion.x7 - ncit\n",
+       "- MESH Suggestion.x7 - mesh\n",
+       "- ICD9CM Suggestion.x7 - icd9cm\n",
+       "- SNOMEDCT_US_2021_09_01 Suggestion.x7 - snomedct\n",
+       "- UMLS_CUI Suggestion.x7 - umls\n",
+       "- ICD10CM Suggestion.x7 - icd10cm\n",
+       "- ORDO Suggestion.x7 - orphanet.ordo\n",
+       "- GARD Suggestion.x7 - gard\n",
+       "- OMIM Suggestion.x7 - omim\n",
+       "- ICDO Suggestion.x7 - icdo\n",
+       "- EFO Suggestion.x7 - efo\n",
+       "- MEDDRA Suggestion.x7 - meddra\n",
+       "- KEGG Suggestion.x7 - kegg\n",
+       "- SNOMEDCT_US_2021_07_31 Suggestion.x7 - snomedct\n",
+       "- SNOMEDCT_US_2020_03_01 Suggestion.x7 - snomedct\n",
+       "- SNOMEDCT_US_2020_09_01 Suggestion.x7 - snomedct\n"
+      ],
+      "text/plain": [
+       "Report(converter=<curies.api.Converter object at 0x136ff2f10>, column='object_id', nones=0, stayed=2, updated=0)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "obo_converter.pd_standardize_curie(df.copy(), column=\"object_id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "245227da-d4e2-4ede-9844-bd448ef0e54b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<curies.api.Converter at 0x1475df390>"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "bioregistry_converter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4e028a67-634a-4b2e-ad16-aca23fc47e28",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "Standardization was successfully applied to all 36,730 CURIEs in column `object_id`."
+      ],
+      "text/plain": [
+       "Report(converter=<curies.api.Converter object at 0x1475df390>, column='object_id', nones=0, stayed=0, updated=36730)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "bioregistry_converter.pd_standardize_curie(df.copy(), column=\"object_id\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4fa4f1f8-e2cc-4230-8a36-2f8eb9d8b93f",
+   "metadata": {},
+   "source": [
+    "# Mixed CURIEs and URIs Demo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7239d782-e952-40fc-9a0a-5ae0753fdb22",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "## Summary\n",
+       "\n",
+       "Standardization was not necessary for 1 (20.0%), resulted in 1 updates (20.0%), and 2 failures (40.0%)  in column `0`. Here's a breakdown of the prefixes that weren't possible to standardize:\n",
+       "\n",
+       "| prefix      |   count | examples                               |\n",
+       "|:------------|--------:|:---------------------------------------|\n",
+       "| http        |       1 | http://purl.obolibrary.org/obo/CHEBI_2 |\n",
+       "| not_a_curie |       1 | not_a_curie                            |\n",
+       "\n",
+       "## Suggestions\n",
+       "\n",
+       "- http Suggestion.x2\n",
+       "- not_a_curie Suggestion.x3\n"
+      ],
+      "text/plain": [
+       "Report(converter=<curies.api.Converter object at 0x1475df390>, column=0, nones=1, stayed=1, updated=1)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# One value per row: two differently-cased CURIEs, a URI, a non-CURIE string, and a missing value\n",
+    "mixed_values = [\n",
+    "    \"chebi:1\",\n",
+    "    \"http://purl.obolibrary.org/obo/CHEBI_2\",\n",
+    "    \"CHEBI:3\",\n",
+    "    \"not_a_curie\",\n",
+    "    None,\n",
+    "]\n",
+    "# Wrap each value in a 1-tuple so the frame has a single column labelled 0\n",
+    "mixed_df = pd.DataFrame([(value,) for value in mixed_values])\n",
+    "bioregistry_converter.pd_standardize_curie(mixed_df, column=0)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
@@ -16,6 +16,7 @@
     load_prefix_map,
 )
 from .reconciliation import remap_curie_prefixes, remap_uri_prefixes, rewire
+from .report import Report
 from .sources import (
     get_bioregistry_converter,
     get_go_converter,
@@ -28,6 +29,7 @@
 __all__ = [
     "Converter",
     "Record",
+    "Report",
     "ReferenceTuple",
     "Reference",
     "DuplicateValueError",