Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SpatialDataValidator #36

Merged
merged 4 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ __pycache__/

Sandbox.ipynb
test.ipynb
*/10ktp__10X__Visium__Mouse__brain__20200623__v1.1.0.zarr
578 changes: 578 additions & 0 deletions scripts/validator_demo.ipynb

Large diffs are not rendered by default.

19 changes: 9 additions & 10 deletions src/spatialdata_db/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from importlib.metadata import version
from importlib import resources
import pandas as pd


def load_10x_metadata():
with resources.open_text("spatialdata_db.utils.data", "datasets_10x.csv") as file:
return pd.read_csv(file, sep=";")


__all__ = ["load_10x_metadata"]
from spatialdata_db.parsing import load_10x_metadata
from django.core.exceptions import ImproperlyConfigured
from lamin_utils import logger
try:
from spatialdata_db.lamin_spatialdata_validator import SpatialDataValidator
except ImproperlyConfigured:
logger.warning("Importing SpatialDataValidator currently requires being connected to a lamindb instance.")

__all__ = ["load_10x_metadata", "SpatialDataValidator"]
__version__ = version("spatialdata-db")
133 changes: 133 additions & 0 deletions src/spatialdata_db/lamin_spatialdata_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import bionty as bt
import pandas as pd
import anndata as ad
import spatialdata as sd

from lamindb.core import AnnDataCurator, DataFrameCurator
from lamindb_setup.core.types import UPathStr
from lnschema_core import Record
from lnschema_core.types import FieldAttr
from lamin_utils import logger, colors

def _add_defaults(data: pd.DataFrame | UPathStr, defaults: dict[str, str] = None) -> None:
"""Adds default values to a Pandas DataFrame if values are missing."""
if defaults:
if isinstance(data, UPathStr):
data = pd.read_csv(UPathStr) # TODO this parsing is not very safe

for col, default in defaults.items():
if col not in data.columns:
data[col] = default
else:
data[col].fillna(default, inplace=True)


class SpatialDataMetadataValidator(DataFrameCurator):
    """Curator for the global (tabular) metadata of a SpatialData object.

    Wraps lamindb's ``DataFrameCurator`` with project defaults: which columns
    are validated as categoricals, what value to fill in when a column is
    missing, and which ontology source each column is validated against.
    """

    # Columns validated as categoricals, mapped to the bionty field to match on.
    DEFAULT_CATEGORICALS = {
        "assay": bt.ExperimentalFactor.name,
    }

    # Fill values applied (via _add_defaults) before validation when a column
    # is missing or has NaNs.
    DEFAULT_VALUES = {
        "assay": "na",
    }

    # NOTE(review): this query runs at class-definition (import) time, so simply
    # importing this module requires a connected lamindb instance — confirm this
    # is intended (the package __init__ guards for it).
    FIXED_SOURCES = {
        "assay": bt.Source.filter(entity="bionty.ExperimentalFactor", name="efo", version="3.70.0").one()
    }

    def __init__(
        self,
        data: pd.DataFrame | UPathStr,
        categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        defaults: dict[str, str] = DEFAULT_VALUES,
        sources: dict[str, Record] = FIXED_SOURCES,
        organism="human",
    ):
        """Fill in defaults on *data* and initialize the underlying curator.

        Args:
            data: Metadata table, or a path to one (see _add_defaults).
            categoricals: Column -> bionty field mapping to validate against.
            defaults: Column -> fill value applied before validation.
            sources: Column -> ontology Source record used for validation.
            organism: Organism passed through to the curator.
        """
        self.data = data

        # Mutates data in place; note that if data is a path the loaded frame is
        # not kept here — presumably callers always pass a DataFrame. TODO confirm.
        _add_defaults(data, defaults)

        super().__init__(
            df=data, categoricals=categoricals, sources=sources, organism=organism
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the global SpatialDataMetadata."""
        return DataFrameCurator.validate(self, organism)

class SpatialDataTableValidator(AnnDataCurator):
    """Curator for a single AnnData table inside a SpatialData object.

    Wraps lamindb's ``AnnDataCurator`` with project defaults for obs-column
    validation and remembers which SpatialData table key it belongs to.
    """

    # obs columns validated as categoricals, mapped to the bionty field to match on.
    DEFAULT_CATEGORICALS = {
        "celltype": bt.CellType.name,
    }

    # Fill values applied to obs (via _add_defaults) before validation.
    DEFAULT_VALUES = {
        "celltype": "normal",
    }

    # NOTE(review): this query runs at import time (requires a lamindb
    # connection), and DEFAULT_SOURCES is never forwarded to super().__init__
    # below — presumably it should be passed as `sources`; verify.
    DEFAULT_SOURCES = {
        "celltype": bt.Source.filter(entity="bionty.CellType", name="cl", version="2023-08-24").one()
    }

    # TODO not every AnnData objects will have all of these obs columns present but one of them should
    # Figure out how to pass the categoricals to the respective tables

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr = bt.Gene.ensembl_gene_id,
        categoricals: dict[str, FieldAttr] = DEFAULT_CATEGORICALS,
        *,
        defaults: dict[str, str] = DEFAULT_VALUES,
        table_key: str,
        organism="human",
    ):
        """Fill in obs defaults and initialize the underlying curator.

        Args:
            data: The AnnData table to validate. NOTE(review): `data.obs` is
                accessed directly below, so a UPathStr input would fail here
                despite the annotation — confirm paths are ever passed.
            var_index: Field used to validate the var index (gene identifiers).
            categoricals: obs column -> bionty field mapping.
            defaults: obs column -> fill value applied before validation.
            table_key: Key of this table within the parent SpatialData object.
            organism: Organism passed through to the curator.
        """
        self.data = data
        self.table_key = table_key

        # Mutates data.obs in place before the curator sees it.
        _add_defaults(data.obs, defaults)

        super().__init__(
            data=data, var_index=var_index, categoricals=categoricals, organism=organism
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validate the table."""
        return super().validate(organism)


class SpatialDataValidator:
    """Custom curation flow for SpatialData.

    Composes a :class:`SpatialDataMetadataValidator` for the global metadata
    table and one :class:`SpatialDataTableValidator` per AnnData table.
    """

    def __init__(
        self,
        sdata: sd.SpatialData | UPathStr,
        *,
        organism="human",
    ):
        """Build sub-validators for the metadata and every table of *sdata*.

        Args:
            sdata: The SpatialData object to validate.
            organism: Organism forwarded to every sub-validator.
        """
        self.sdata = sdata
        self.organism = organism

        # TODO think about how to integrate the parameters -> some weird nested quirky thing

        self.metadata_validator = SpatialDataMetadataValidator(data=self.sdata.metadata, organism=self.organism)
        self.table_validators = {
            table_key: SpatialDataTableValidator(
                data=self.sdata.tables[table_key], table_key=table_key, organism=self.organism
            )
            for table_key in self.sdata.tables
        }

    def validate(self, organism: str | None = None) -> bool:
        """Validating Spatialdata objects including the metadata and all tables (AnnData objects).

        Returns:
            True only if the global metadata AND every table validate.
        """
        # TODO this should very clearly state which things were able to be validate or not

        logger.info(f"Validating {colors.green('metadata')}.")
        is_metadata_validated = self.metadata_validator.validate(organism)

        # Bug fix: the previous code overwrote is_tables_validated every
        # iteration, so only the LAST table's result counted and a SpatialData
        # with zero tables always reported False. Accumulate with `and` instead
        # (every table is still validated — no short-circuiting).
        is_tables_validated = True
        for table_key, sdtvalidator in self.table_validators.items():
            logger.info(f"Validating Anndata object with key {colors.green(table_key)}")
            is_tables_validated = sdtvalidator.validate(organism) and is_tables_validated

        return is_metadata_validated and is_tables_validated
7 changes: 7 additions & 0 deletions src/spatialdata_db/parsing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from importlib import resources
import pandas as pd


def load_10x_metadata():
    """Load the packaged 10x Genomics dataset metadata table.

    Returns:
        A pandas DataFrame parsed from the semicolon-separated
        ``datasets_10x.csv`` shipped inside ``spatialdata_db.utils.data``.
    """
    # importlib.resources.open_text is deprecated since Python 3.11;
    # the files()/joinpath traversable API is the supported replacement.
    csv_resource = resources.files("spatialdata_db.utils.data").joinpath("datasets_10x.csv")
    with csv_resource.open("r", encoding="utf-8") as file:
        return pd.read_csv(file, sep=";")
Loading