diff --git a/notebooks/Data Science Demo.ipynb b/notebooks/Data Science Demo.ipynb new file mode 100644 index 00000000..7943aed1 --- /dev/null +++ b/notebooks/Data Science Demo.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "cfb169ac-d6e7-4132-9ffc-a14edf8a918f", + "metadata": {}, + "outputs": [], + "source": [ + "import curies\n", + "import pandas as pd\n", + "import itertools as itt\n", + "import pystow" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3091dc17-b60d-4cc1-94a9-c523b3cce4e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 185 ms, sys: 108 ms, total: 293 ms\n", + "Wall time: 917 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "obo_converter = curies.get_obo_converter()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d4e138e4-31f5-4c0d-ba0f-9849586af00c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6.73 s, sys: 63 ms, total: 6.79 s\n", + "Wall time: 6.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "bioregistry_converter = curies.get_bioregistry_converter()" + ] + }, + { + "cell_type": "markdown", + "id": "f94b0791-ab75-481b-9e83-8990f0fbc4f1", + "metadata": {}, + "source": [ + "# Disease Ontology SSSOM Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5301bafc-15eb-45bc-adf6-6281d6da1b3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([['DOID:8717', 'oboInOwl:hasDbXref', 'NCI:C50706'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'MESH:D003668'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'ICD9CM:707.0'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref',\n", + " 'SNOMEDCT_US_2021_09_01:28103007'],\n", + " ['DOID:8717', 'oboInOwl:hasDbXref', 'UMLS_CUI:C0011127']],\n", + " dtype=object)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" 
+ } + ], + "source": [ + "commit = \"faca4fc335f9a61902b9c47a1facd52a0d3d2f8b\"\n", + "url = f\"https://raw.githubusercontent.com/mapping-commons/disease-mappings/{commit}/mappings/doid.sssom.tsv\"\n", + "df = pystow.ensure_csv(\"tmp\", url=url, read_csv_kwargs=dict(comment=\"#\"))\n", + "df.head()[[\"subject_id\", \"predicate_id\", \"object_id\"]].values" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "53ae14ad-1665-472f-a849-f6e2fa95fde4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Summary\n", + "\n", + "Standardization was not necessary for 2 (0.0%), resulted in 0 updates (0.0%), and 34,522 failures (100.0%) in column `object_id`. Here's a breakdown of the prefixes that weren't possible to standardize:\n", + "\n", + "| prefix | count | examples |\n", + "|:-----------------------|--------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| EFO | 131 | EFO:0000274, EFO:0001071, EFO:0001075, EFO:0001422, EFO:0004705 |\n", + "| GARD | 2030 | GARD:2562, GARD:5721, GARD:6291, GARD:7065, GARD:8378 |\n", + "| ICD10CM | 3666 | ICD10CM:A21.0, ICD10CM:C03, ICD10CM:K72, ICD10CM:K82.4, ICD10CM:N30.0 |\n", + "| ICD9CM | 2266 | ICD9CM:214.4, ICD9CM:232.4, ICD9CM:377.75, ICD9CM:428.2, ICD9CM:745.6 |\n", + "| ICDO | 361 | ICDO:8300/0, ICDO:8840/3, ICDO:9442/1, ICDO:9530/0, ICDO:9590/3 |\n", + "| KEGG | 41 | KEGG:05016, KEGG:05133, KEGG:05142, KEGG:05222, KEGG:05414 |\n", + "| MEDDRA | 41 | MEDDRA:10001229, MEDDRA:10015487, MEDDRA:10021312, MEDDRA:10059200, MEDDRA:10060740 |\n", + "| MESH | 3847 | MESH:D002128, MESH:D005141, MESH:D009198, MESH:D011040, MESH:D017240 |\n", + "| NCI | 4788 | NCI:C26913, NCI:C27390, NCI:C27871, NCI:C40284, NCI:C6081 |\n", + "| OMIM | 5539 | OMIM:209700, OMIM:222300, OMIM:530000, OMIM:613021, OMIM:618224 |\n", + "| ORDO | 2023 | ORDO:139441, ORDO:2510, ORDO:255229, 
ORDO:420702, ORDO:48652 |\n", + "| SNOMEDCT_US_2020_03_01 | 6 | SNOMEDCT_US_2020_03_01:236818008, SNOMEDCT_US_2020_03_01:778024005, SNOMEDCT_US_2020_03_01:8757006 |\n", + "| SNOMEDCT_US_2020_09_01 | 1 | SNOMEDCT_US_2020_09_01:1112003 |\n", + "| SNOMEDCT_US_2021_07_31 | 10 | SNOMEDCT_US_2021_07_31:268180007, SNOMEDCT_US_2021_07_31:703536004, SNOMEDCT_US_2021_07_31:721311006, SNOMEDCT_US_2021_07_31:75931002 |\n", + "| SNOMEDCT_US_2021_09_01 | 5088 | SNOMEDCT_US_2021_09_01:111359004, SNOMEDCT_US_2021_09_01:155748004, SNOMEDCT_US_2021_09_01:238113006, SNOMEDCT_US_2021_09_01:38804009, SNOMEDCT_US_2021_09_01:92585006 |\n", + "| UMLS_CUI | 6890 | UMLS_CUI:C0031347, UMLS_CUI:C0206724, UMLS_CUI:C0276007, UMLS_CUI:C0392492, UMLS_CUI:C1515285 |\n", + "\n", + "## Suggestions\n", + "\n", + "- NCI Suggestion.x7 - ncit\n", + "- MESH Suggestion.x7 - mesh\n", + "- ICD9CM Suggestion.x7 - icd9cm\n", + "- SNOMEDCT_US_2021_09_01 Suggestion.x7 - snomedct\n", + "- UMLS_CUI Suggestion.x7 - umls\n", + "- ICD10CM Suggestion.x7 - icd10cm\n", + "- ORDO Suggestion.x7 - orphanet.ordo\n", + "- GARD Suggestion.x7 - gard\n", + "- OMIM Suggestion.x7 - omim\n", + "- ICDO Suggestion.x7 - icdo\n", + "- EFO Suggestion.x7 - efo\n", + "- MEDDRA Suggestion.x7 - meddra\n", + "- KEGG Suggestion.x7 - kegg\n", + "- SNOMEDCT_US_2021_07_31 Suggestion.x7 - snomedct\n", + "- SNOMEDCT_US_2020_03_01 Suggestion.x7 - snomedct\n", + "- SNOMEDCT_US_2020_09_01 Suggestion.x7 - snomedct\n" + ], + "text/plain": [ + "Report(converter=, column='object_id', nones=0, stayed=2, updated=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "obo_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "245227da-d4e2-4ede-9844-bd448ef0e54b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "bioregistry_converter" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4e028a67-634a-4b2e-ad16-aca23fc47e28", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "Standardization was successfully applied to all 36,730 CURIEs in column `object_id`." + ], + "text/plain": [ + "Report(converter=, column='object_id', nones=0, stayed=0, updated=36730)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bioregistry_converter.pd_standardize_curie(df.copy(), column=\"object_id\")" + ] + }, + { + "cell_type": "markdown", + "id": "4fa4f1f8-e2cc-4230-8a36-2f8eb9d8b93f", + "metadata": {}, + "source": [ + "# Mixed CURIEs and URIs demo" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7239d782-e952-40fc-9a0a-5ae0753fdb22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Summary\n", + "\n", + "Standardization was not necessary for 1 (20.0%), resulted in 1 updates (20.0%), and 2 failures (40.0%) in column `0`. 
Here's a breakdown of the prefixes that weren't possible to standardize:\n", + "\n", + "| prefix | count | examples |\n", + "|:------------|--------:|:---------------------------------------|\n", + "| http | 1 | http://purl.obolibrary.org/obo/CHEBI_2 |\n", + "| not_a_curie | 1 | not_a_curie |\n", + "\n", + "## Suggestions\n", + "\n", + "- http Suggestion.x2\n", + "- not_a_curie Suggestion.x3\n" + ], + "text/plain": [ + "Report(converter=, column=0, nones=1, stayed=1, updated=1)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mixed_df = pd.DataFrame(\n", + " [\n", + " (\"chebi:1\",),\n", + " (\"http://purl.obolibrary.org/obo/CHEBI_2\",),\n", + " (\"CHEBI:3\",),\n", + " (\"not_a_curie\",),\n", + " (None,),\n", + " ]\n", + ")\n", + "bioregistry_converter.pd_standardize_curie(mixed_df, column=0)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/curies/__init__.py b/src/curies/__init__.py index 1145ae36..d5b918fc 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -16,6 +16,7 @@ load_prefix_map, ) from .reconciliation import remap_curie_prefixes, remap_uri_prefixes, rewire +from .report import Report from .sources import ( get_bioregistry_converter, get_go_converter, @@ -28,6 +29,7 @@ __all__ = [ "Converter", "Record", + "Report", "ReferenceTuple", "Reference", "DuplicateValueError", diff --git a/src/curies/api.py b/src/curies/api.py index c8d41802..c2a7b671 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -5,7 +5,7 @@ import csv import itertools as itt import json -from collections 
def pd_standardize_curie(
    self,
    df: "pandas.DataFrame",
    *,
    column: Union[str, int],
    target_column: Union[None, str, int] = None,
    strict: bool = False,
    passthrough: bool = False,
) -> "curies.Report":
    r"""Standardize all CURIEs in the given column.

    :param df: A pandas DataFrame
    :param column: The column in the dataframe containing CURIEs
    :param target_column: The column to put the results in. Defaults to the input column.
    :param strict: If true and any CURIE can't be standardized, raises an error.
        Defaults to false.
    :param passthrough: If true, strict is false, and a CURIE can't be standardized,
        keep the input value in place instead of replacing it with ``None``.
        Defaults to false.
    :return: A report object summarizing how many CURIEs stayed, were updated,
        were missing, or failed, plus a breakdown of failing prefixes
    :raises ValueError: If strict is enabled and the column contains CURIEs
        that aren't standardizable

    The Disease Ontology curates mappings to other semantic spaces and distributes
    them in the tabular SSSOM format. However, they use a wide variety of
    non-standard prefixes for referring to external vocabularies.

    >>> import curies
    >>> converter = curies.get_bioregistry_converter()
    >>> converter.pd_standardize_curie(df, column="object_id")
    """
    import pandas as pd

    from .report import Report

    norm_curies: List[Optional[str]] = []
    failures: DefaultDict[str, Counter[str]] = defaultdict(Counter)
    stayed = 0
    updated = 0
    nones = 0
    for curie in df[column]:
        if pd.isna(curie):
            nones += 1
            norm_curies.append(None)
            continue
        try:
            norm_curie = self.standardize_curie(curie)
        except ValueError:
            # happens on an invalid CURIE, i.e., one without a colon;
            # treated the same as an unparseable-but-well-formed CURIE
            norm_curie = None
        if norm_curie is None:
            failures[curie.split(":")[0]][curie] += 1
            if passthrough:
                # NOTE(review): honor ``passthrough`` — a previous revision
                # accepted the parameter but silently ignored it, always
                # writing ``None`` for failures
                norm_curie = curie
        elif curie == norm_curie:
            stayed += 1
        else:
            updated += 1
        norm_curies.append(norm_curie)
    report = Report(
        converter=self,
        failures=failures,
        nones=nones,
        stayed=stayed,
        updated=updated,
        column=column,
    )
    if strict and failures:
        raise ValueError(
            f"Some CURIEs couldn't be standardized and strict mode is enabled. Either set "
            f"`strict=False`, and entries that can't be parsed will be given `None`, or try "
            f"and improve your context to better cover your data. Here's the report:\n\n{report.get_markdown()}"
        )
    df[column if target_column is None else target_column] = norm_curies
    return report
:param converters: A list or tuple of converters diff --git a/src/curies/report.py b/src/curies/report.py new file mode 100644 index 00000000..d78cba34 --- /dev/null +++ b/src/curies/report.py @@ -0,0 +1,175 @@ +"""Report.""" + +import dataclasses +import enum +import random +import typing +from collections import Counter, defaultdict +from typing import TYPE_CHECKING, Dict, Mapping, Optional, Tuple + +from .api import Converter + +if TYPE_CHECKING: + import pandas + +__all__ = [ + "Report", +] + + +def _list(correct: typing.Sequence[str]) -> str: + if len(correct) == 1: + return f"`{correct[0]}`" + if len(correct) == 2: + return f"`{correct[0]}` or `{correct[1]}`" + x = ", ".join(f"`{v}`" for v in correct[:-1]) + return f"{x}, or `{correct[-1]}`" + + +class Suggestion(enum.Enum): + """""" + + x1 = "means data is encoded using URNs, which isn't explicitly handled by this package." + x2 = "entries are not CURIEs, try and compressing your data first." + x3 = "is not a valid CURIE" + x4 = "has a double prefix annotation" + x5 = "is a case/punctuation variant" + x6 = "is an incorrect way of encoding a URI" + x7 = ( + f"appears in Bioregistry under. Consider chaining your converter with the Bioregistry using " + "[`curies.chain()`](https://curies.readthedocs.io/en/latest/api/curies.chain.html)." 
+ ) + xx = ( + "can either be added to the converter if it is local to the project, " + "or if it is globally useful, contributed to the Bioregistry" + ) + + +@dataclasses.dataclass +class Report: + """A report on CURIEs standardization.""" + + converter: "Converter" + column: str | int + nones: int + stayed: int + updated: int + failures: Mapping[str, typing.Counter[str]] = dataclasses.field(repr=False) + + def count_prefixes(self) -> typing.Counter[str]: + """Count the frequency of each failing prefix.""" + return Counter({prefix: len(counter) for prefix, counter in self.failures.items()}) + + def get_df(self) -> "pandas.DataFrame": + """Summarize standardization issues in a dataframe.""" + import pandas as pd + + rows = [ + ( + prefix, + sum(counter.values()), + ", ".join(sorted(set(random.choices(list(counter), k=5)))), # noqa:S311 + ) + for prefix, counter in sorted(self.failures.items(), key=lambda p: p[0].casefold()) + ] + return pd.DataFrame(rows, columns=["prefix", "count", "examples"]) + + def get_suggestions(self) -> Dict[str, Tuple[Suggestion, Optional[str]]]: + """Get a mapping from missing prefix to suggestion text.""" + try: + import bioregistry + except ImportError: + bioregistry = None + + norm_to_prefix = defaultdict(set) + + def _norm(s: str) -> str: + for x in "_.- ": + s = s.replace(x, "") + return s.casefold() + + for record in self.converter.records: + for p in record._all_prefixes: + norm_to_prefix[_norm(p)].add(p) + + rv: dict[str, tuple[Suggestion, str | None]] = {} + for prefix, c in self.failures.items(): + if prefix in {"url", "uri", "iri"}: + rv[prefix] = Suggestion.x6, None + continue + if prefix in {"urn"}: + rv[prefix] = Suggestion.x1, None + continue + if prefix in {"http", "https", "ftp"}: + rv[prefix] = Suggestion.x2, None + continue + if len(c) == 1: + first = list(c)[0] + if first == prefix: + rv[prefix] = Suggestion.x3, None + continue + elif first.lower() == f"{prefix.lower()}:{prefix.lower()}": + rv[prefix] = Suggestion.x4, 
prefix.lower() + continue + correct = sorted(norm_to_prefix.get(_norm(prefix), [])) + if correct: + rv[prefix] = Suggestion.x5, _list(correct) + continue + + if bioregistry is not None: + norm_prefix = bioregistry.normalize_prefix(prefix) + if norm_prefix: + rv[prefix] = Suggestion.x7, norm_prefix + continue + + # TODO check for bananas? + rv[prefix] = Suggestion.xx, None + return rv + + def get_markdown(self) -> str: + """Get markdown text.""" + try: + import bioregistry + except ImportError: + bioregistry = None + + failures = sum(len(c) for c in self.failures.values()) + total = self.nones + self.stayed + self.updated + failures + df = self.get_df() + + # TODO write # CURIEs, # unique CURIEs, and # unique prefixes + text = "## Summary\n\n" + if 0 == len(df.index): + if not self.stayed: + return ( + f"Standardization was successfully applied to all " + f"{self.updated:,} CURIEs in column `{self.column}`." + ) + return ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}) CURIEs " + f"and resulted in updates for {self.updated:,} ({self.updated/total:.1%}) CURIEs " + f"in column `{self.column}`" + ) + + if bioregistry is None: + text += "\nInstall the Bioregistry with `pip install bioregistry` for more detailed suggestions\n\n" + text += ( + f"Standardization was not necessary for {self.stayed:,} ({self.stayed/total:.1%}), " + f"resulted in {self.updated:,} updates ({self.updated/total:.1%}), and {failures:,} failures " + f"({failures/total:.1%}) in column `{self.column}`. 
Here's a breakdown of the prefixes that " + f"weren't possible to standardize:\n\n" + ) + text += df.to_markdown(index=False) + + suggestions = self.get_suggestions() + if suggestions: + text += "\n\n## Suggestions\n\n" + for prefix, (suggestion, extra) in suggestions.items(): + text += f"- {prefix} {suggestion}" + if extra: + text += f" - {extra}" + text += "\n" + return text + + def _repr_markdown_(self) -> str: + return self.get_markdown() diff --git a/src/curies/sources.py b/src/curies/sources.py index 13195360..190b7008 100644 --- a/src/curies/sources.py +++ b/src/curies/sources.py @@ -4,7 +4,7 @@ from typing import Any -from .api import Converter +from .api import Converter, Record __all__ = [ "get_obo_converter", @@ -61,6 +61,19 @@ def get_bioregistry_converter(web: bool = False, **kwargs: Any) -> Converter: pass else: epm = bioregistry.manager.get_curies_records() # pragma: no cover + for record in epm: # pragma: no cover + # Remove this after https://github.com/biopragmatics/bioregistry/issues/935 is fixed + _augment_curie_prefix_synonyms(record) # pragma: no cover return Converter.from_extended_prefix_map(epm) # pragma: no cover url = f"{BIOREGISTRY_CONTEXTS}/bioregistry.epm.json" return Converter.from_extended_prefix_map(url, **kwargs) + + +def _augment_curie_prefix_synonyms(record: Record) -> None: + new_prefix_synonyms = set() + for s in record._all_prefixes: + new_prefix_synonyms.add(s) + new_prefix_synonyms.add(s.lower()) + new_prefix_synonyms.add(s.upper()) + new_prefix_synonyms.difference_update(record.prefix) + record.prefix_synonyms = sorted(new_prefix_synonyms) diff --git a/tests/test_data_science.py b/tests/test_data_science.py new file mode 100644 index 00000000..73887faa --- /dev/null +++ b/tests/test_data_science.py @@ -0,0 +1,26 @@ +"""Tests for data science utilities.""" + +import unittest + +import pandas as pd + +import curies + + +class TestDataScience(unittest.TestCase): + """Test case for data science utilities.""" + + def 
test_case_mismatch(self): + """Test case mismatch on CURIE standardizations.""" + data = ["EFO:1", "nope:nope"] + df = pd.DataFrame([(row,) for row in data], columns=["curie"]) + + converter = curies.Converter.from_prefix_map({"efo": "https://identifiers.org/efo:"}) + with self.assertRaises(ValueError): + converter.pd_standardize_curie(df, column="curie", strict=True) + + results = converter.pd_standardize_curie(df, column="curie") + suggestions = results.get_suggestions() + self.assertIsInstance(suggestions, dict) + self.assertIn("", suggestions) + # FIXME add more detailed tests