Add validator demo
Signed-off-by: zethson <[email protected]>
Zethson committed Oct 9, 2024
1 parent 6c0c9e5 commit a30ad8c
Showing 5 changed files with 615 additions and 30 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -29,3 +29,4 @@ __pycache__/

Sandbox.ipynb
test.ipynb
+*/10ktp__10X__Visium__Mouse__brain__20200623__v1.1.0.zarr
578 changes: 578 additions & 0 deletions scripts/validator_demo.ipynb

Large diffs are not rendered by default.

12 changes: 3 additions & 9 deletions src/spatialdata_db/__init__.py
@@ -1,12 +1,6 @@
from importlib.metadata import version
-from importlib import resources
-import pandas as pd
+from spatialdata_db.parsing import load_10x_metadata
+from spatialdata_db.lamin_spatialdata_validator import SpatialDataValidator

-
-def load_10x_metadata():
-    with resources.open_text("spatialdata_db.utils.data", "datasets_10x.csv") as file:
-        return pd.read_csv(file, sep=";")
-
-
-__all__ = ["load_10x_metadata"]
+__all__ = ["load_10x_metadata", "SpatialDataValidator"]
__version__ = version("spatialdata-db")
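
In effect, load_10x_metadata now lives in spatialdata_db.parsing and the new SpatialDataValidator is re-exported, so both remain importable from the package root. A minimal usage sketch (not part of the diff; the print call is illustrative only):

# Sketch of the resulting import surface after this commit.
from spatialdata_db import SpatialDataValidator, load_10x_metadata

datasets = load_10x_metadata()  # bundled, semicolon-separated 10x dataset metadata as a DataFrame
print(datasets.head())
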
47 changes: 26 additions & 21 deletions src/spatialdata_db/lamin_spatialdata_validator.py
@@ -7,6 +7,7 @@
from lamindb_setup.core.types import UPathStr
from lnschema_core import Record
from lnschema_core.types import FieldAttr
+from lamin_utils import logger, colors

def _add_defaults(data: pd.DataFrame | UPathStr, defaults: dict[str, str] = None) -> None:
    """Adds default values to a Pandas DataFrame if values are missing."""
@@ -24,15 +25,15 @@ def _add_defaults(data: pd.DataFrame | UPathStr, defaults: dict[str, str] = None
class SpatialDataMetadataValidator(DataFrameCurator):

    DEFAULT_CATEGORICALS = {
-        "disease": bt.Disease.name,
+        "assay": bt.ExperimentalFactor.name,
    }

    DEFAULT_VALUES = {
-        "disease": "normal",
+        "assay": "na",
    }

    FIXED_SOURCES = {
-        "disease": bt.Source.filter(entity="bionty.Disease", name="mondo", version="2023-04-04").one()
+        "assay": bt.Source.filter(entity="bionty.ExperimentalFactor", name="efo", version="3.70.0").one()
    }

    def __init__(
@@ -45,7 +46,6 @@ def __init__(
organism="human",
):
self.data = data
self.organism = organism

_add_defaults(data, defaults)

@@ -60,18 +60,19 @@ def validate(self, organism: str | None = None) -> bool:
class SpatialDataTableValidator(AnnDataCurator):

    DEFAULT_CATEGORICALS = {
-        "disease": bt.Disease.name,
+        "celltype": bt.CellType.name,
    }

    DEFAULT_VALUES = {
-        "disease": "normal",
+        "Celltype": "normal",
    }

    FIXED_SOURCES = {
-        "disease": bt.Source.filter(entity="bionty.Disease", name="mondo", version="2023-04-04").one()
+        "celltype": bt.Source.filter(entity="bionty.CellType", name="cl", version="2023-08-24").one()
    }

-    # TODO not every AnnData objects will have all of these obs columns present but one of them should -> define a rule
+    # TODO not every AnnData objects will have all of these obs columns present but one of them should
+    # Figure out how to pass the categoricals to the respective tables

    def __init__(
        self,
@@ -80,18 +81,21 @@ def __init__(
        categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        defaults: dict[str, str] = None,
+        table_key: str,
        organism="human",
    ):
+        self.data = data
+        self.table_key = table_key

        _add_defaults(data, defaults)

        super().__init__(
            data=data, var_index=var_index, categoricals=categoricals, organism=organism
        )

-    def validate(self) -> bool:
-        """Further custom validation."""
-        # --- Custom validation logic goes here --- #
-        return super().validate()
+    def validate(self, organism: str | None = None) -> bool:
+        """Validate the table."""
+        return super().validate(organism)

+
class SpatialDataValidator:
@@ -111,18 +115,19 @@ def __init__(

        # TODO think about how to integrate the parameters -> some weird nested quirky thing

-        self.metadata_validator = SpatialDataMetadataValidator(self.sdata.metadata, organism=self.organism)
-        self.table_validators = [SpatialDataTableValidator(table, organism=self.organism) for table in self.sdata.tables]
+        self.metadata_validator = SpatialDataMetadataValidator(data=self.sdata.metadata, organism=self.organism)
+        self.table_validators = {table_key: SpatialDataTableValidator(data=sdata.tables[table_key], table_key=table_key, organism=self.organism) for table_key in self.sdata.tables.keys()}


    def validate(self, organism: str | None = None) -> bool:
-        """Further custom validation."""
-
+        """Validating Spatialdata objects including the metadata and all tables (AnnData objects)."""
        # TODO this should very clearly state which things were able to be validate or not

-        is_metadata_validated = DataFrameCurator.validate(self.sdata.metadata, organism)
-        is_table_validated = False
-        for sdtvalidator in self.table_validators:
-            is_table_validated = AnnDataCurator.validate(sdtvalidator, organism)
+        logger.info(f"Validating {colors.green('metadata')}.")
+        is_metadata_validated = self.metadata_validator.validate(organism)
+        is_tables_validated = False
+        for table_key, sdtvalidator in self.table_validators.items():
+            logger.info(f"Validating Anndata object with key {colors.green(sdtvalidator.table_key)}")
+            is_tables_validated = sdtvalidator.validate(organism)

-        return is_metadata_validated and is_table_validated
+        return is_metadata_validated and is_tables_validated
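
For orientation, a rough end-to-end sketch of driving the refactored SpatialDataValidator. The zarr path is hypothetical, and the constructor arguments plus the .metadata DataFrame on the SpatialData object are assumptions inferred from how self.sdata and self.organism are used above, not guaranteed by this diff:

import spatialdata as sd

from spatialdata_db import SpatialDataValidator

# Hypothetical dataset path; any SpatialData zarr store with annotated tables would do.
sdata = sd.read_zarr("path/to/dataset.zarr")

# Assumed constructor signature: the SpatialData object plus an organism.
validator = SpatialDataValidator(sdata, organism="human")

# Runs the metadata curator and one AnnData curator per table key,
# logging each step and combining the results.
is_valid = validator.validate(organism="human")
print(f"validated: {is_valid}")
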
7 changes: 7 additions & 0 deletions src/spatialdata_db/parsing.py
@@ -0,0 +1,7 @@
+from importlib import resources
+import pandas as pd
+
+
+def load_10x_metadata():
+    with resources.open_text("spatialdata_db.utils.data", "datasets_10x.csv") as file:
+        return pd.read_csv(file, sep=";")
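
Calling the relocated helper then looks like this (a sketch; the column names depend on the bundled datasets_10x.csv and are not shown in this diff):

from spatialdata_db.parsing import load_10x_metadata

df = load_10x_metadata()      # reads the packaged CSV with sep=";"
print(df.shape)
print(df.columns.tolist())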
