Add metric morans i #303

Draft · wants to merge 7 commits into base: main
9 changes: 9 additions & 0 deletions scib/metrics/metrics.py
@@ -10,6 +10,7 @@
from .isolated_labels import isolated_labels
from .kbet import kBET
from .lisi import clisi_graph, ilisi_graph
from .morans_i import morans_i
from .nmi import nmi
from .pcr import pcr_comparison
from .silhouette import silhouette, silhouette_batch
@@ -193,6 +194,7 @@ def metrics(
    n_isolated=None,
    graph_conn_=False,
    trajectory_=False,
    moransi_=False,
    kBET_=False,
    lisi_graph_=False,
    ilisi_=False,
@@ -465,6 +467,12 @@ def metrics(
    else:
        trajectory_score = np.nan

    if moransi_:
        print("Moran's I score...")
        moransi_score = morans_i(adata, adata_int, batch_key=batch_key)
    else:
        moransi_score = np.nan

    results = {
        "NMI_cluster/label": nmi_score,
        "ARI_cluster/label": ari_score,
@@ -480,6 +488,7 @@
        "cLISI": clisi,
        "hvg_overlap": hvg_score,
        "trajectory": trajectory_score,
        "moransi": moransi_score,
    }

    return pd.DataFrame.from_dict(results, orient="index")
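
For context, a minimal sketch of how the new flag would be used from the aggregate metrics API (the AnnData objects and key names below are placeholders, not part of this PR):

import scib

# adata: unintegrated AnnData; adata_int: integrated AnnData with an embedding
# in .obsm (both assumed to be prepared beforehand)
result = scib.metrics.metrics(
    adata,
    adata_int,
    batch_key="batch",
    label_key="celltype",
    moransi_=True,  # flag added by this PR
)
print(result.loc["moransi"])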
151 changes: 151 additions & 0 deletions scib/metrics/morans_i.py
@@ -0,0 +1,151 @@
import numpy as np
import pandas as pd
import scanpy as sc

from scib.preprocessing import hvg_batch


def morans_i(
    adata_pre,
    adata_post,
    batch_key,
    n_hvg=1000,
Collaborator: 1000 HVGs sounds like a lot for this as a default. Is there any reason for this? Can it be reduced?

Contributor Author: I agree, this can be reduced, maybe to 100?

embed="X_pca",
rescale=True,
compare_pre=False,
hvgs=None,
):
"""Moran's I

Compute Moran's I on HVGs (defined across batches) on integrated data as a measure of bio conservation.
The metric can be computed either as

1. mean difference between Morans's I on embedding of integrated data and max Moran's I of individual non-integrated batches or
2. mean Morans's I on embedding of integrated data.

:param adata_pre: Non-integrated data.
Embedding is computed on scaled (m=0, s=1) X (expression)
per batch (recomputing HVGs, scaling, pca, neighbours).
For Moran's I unscaled X is used.
:param adata_post: Integrate data. Should have precomputed embedding and expression in X.
:param batch_key: Batch obs col in adata_pre.
:param n_hvg: How many top HVGs to use for Moran's I computation.
:param embed: Embedding to use for computation of neighbours in adata_post
:param rescale: Whether to rescale result to [0,1] so that 1 is better bio conservation
:param compare_pre: Whether to compute metric under scenario A instead of B.
:param hvgs: Custom list of genes to use for Moran's I computation (instead of hvgs
computed across batches).
:return: The resulting metric.
"""
# Prepare adatas
# Get integrated connectivities for Moran's I
adata_post = adata_post.copy()
sc.pp.neighbors(adata_post, use_rep=embed)
# Prepare pre data
adata_pre = adata_pre.copy()
adata_pre.obs[batch_key] = adata_pre.obs[batch_key].astype("category")
Collaborator: Please add a call to check_sanity() or the check_adata or check_batch util functions from here:

def check_sanity(adata, batch, hvg):
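
A sketch of what the requested validation could look like, assuming the check_adata and check_batch helpers from scib/utils.py that the reviewer points to (hedged, not part of this diff):

from scib.utils import check_adata, check_batch

# Fail early on malformed inputs before any heavy computation
check_adata(adata_pre)
check_batch(batch_key, adata_pre.obs)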


    # Get HVGs across samples
    if hvgs is None:
        hvgs = hvg_batch(
            adata_pre, batch_key=batch_key, target_genes=n_hvg, flavor="cell_ranger"
        )
    else:
        assert adata_pre.var_names.isin(hvgs).sum() == len(hvgs)

    def compute_mi(data, hvgs):
        """
        Helper function for computing Moran's I on HVGs that are non-constant,
        adding the result to var.
        :param data: adata; the result is added to its var
        :param hvgs: Genes for which to compute Moran's I
        """
        # Make sure that genes used for the computation are non-constant
        cond = (
            np.array(np.mean(data[:, hvgs].X, axis=0))
            != np.array(data[0, hvgs].X.todense())
        ).squeeze()
Collaborator: Mean equal to first value could happen if the genes are not constant, no? Why not define this via standard deviation?

Contributor Author: You are right! std is better!
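
A sketch of the standard-deviation-based filter discussed above (hypothetical, not part of this diff; assumes X may be sparse):

# Keep only genes with non-zero standard deviation across cells
X = data[:, hvgs].X
X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
cond = X.std(axis=0) > 0
hvgs_used = np.array(hvgs)[cond]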

        hvgs_used = np.array(hvgs)[cond]
        if len(hvgs) > len(hvgs_used):
            print(
                "Using subset (%i) of hvgs (%i) that are not constant in the data"
                % (len(hvgs_used), len(hvgs))
            )
        # Compute Moran's I
        morans_i = sc.metrics.morans_i(data[:, hvgs_used])
        morans_i = pd.Series(morans_i, index=hvgs_used)
        # This should not happen, just in case
        if (morans_i > 1).any() or (morans_i < -1).any():
            raise ValueError(
                "Problems in computing Moran's I, value not between -1 and 1"
            )
        data.var["morans_i"] = morans_i

    if compare_pre:
        # Get per-sample connectivities
        adatas_sample = dict()
        for sample in adata_pre.obs[batch_key].unique():
            # Subset only to cells from one sample
            adata_sample = adata_pre[adata_pre.obs[batch_key] == sample, :].copy()
            # Compute HVGs for each sample
            sc.pp.highly_variable_genes(
                adata_sample,
                flavor="cell_ranger",
                batch_key="study_sample",
                n_top_genes=2000,
            )
            adata_sample_scl = adata_sample.copy()
            # Preprocess each sample: scaling, PCA, neighbours
            sc.pp.scale(adata_sample_scl, max_value=10)
Collaborator: Just out of curiosity: any reason for scaling here specifically, or out of habit?

Contributor Author: Most of the code was adapted from Karin. @Hrovatin, do you have an opinion here?

            sc.pp.pca(
                adata_sample_scl,
                n_comps=15,
                use_highly_variable=True,
                svd_solver="arpack",
            )
            sc.pp.neighbors(adata_sample_scl, n_pcs=15)
            # Add computed info back to unscaled data
            adata_sample.uns["neighbors"] = adata_sample_scl.uns["neighbors"]
            adata_sample.obsp["connectivities"] = adata_sample_scl.obsp[
                "connectivities"
            ]
            adata_sample.obsp["distances"] = adata_sample_scl.obsp["distances"]
            adata_sample.uns["pca"] = adata_sample_scl.uns["pca"]
            adata_sample.obsm["X_pca"] = adata_sample_scl.obsm["X_pca"]
            # Save adata of sample
            adatas_sample[sample] = adata_sample
            del adata_sample
            del adata_sample_scl

        # Compute Moran's I across samples and on integrated data
        for data in list(adatas_sample.values()) + [adata_post]:
            compute_mi(data, hvgs)

        # Compute differences in Moran's I
        # Table of Moran's I values across batches
        batch_mis = []
        for sample, data in adatas_sample.items():
            mi = data.var["morans_i"]
            mi.name = sample
            batch_mis.append(mi)
        batch_mis = pd.concat(batch_mis, axis=1)
        # Difference with integrated data
        mi_diffs = adata_post.var["morans_i"] - batch_mis.max(axis=1)
        avg_mi_diff = mi_diffs.mean()
Collaborator: I don't entirely understand this part. You are expecting optimal integration to mean that Moran's I is the same in the full integration as the max in any individual batch? Is the max used due to the possibility of having unique cell types in a single batch? Is this affected by cell type composition differences across batches?

Contributor Author: I guess the max gives an upper bound on the degree to which an individual gene can be clustered. I changed it in #304 such that only worse Moran's I scores are counted; if you agree that the max is an upper bound per gene, then a difference > 0 would mean that the integration was successful in maintaining this trait of the data.

To compare different integrations, I believe that the compare_pre=False computation would be sensible, as scores are directly comparable (their mean).


        # Rescale so that it is in [0, 1] where 1 is better
        # Moran's I will be in [-1, 1] and thus the difference can be in [-2, 2]
        if rescale:
            res = (avg_mi_diff + 2) / 4
Collaborator: Another note on rescaling. Here it seems that a difference of 2 (perfect auto-correlation of markers in the integrated data but max per-batch auto-correlation is -1) is optimal. I'm not sure this should be the case. Wouldn't the idea be to detect the same spatial correlation structure in a single batch as across all of them? I guess you want a 1 - (avg_mi_diff + 2) / 4 here, no?

Also, should a Moran's I of -1 and 0 be distinguished for the comparison at all? Both mean no biologically meaningful information is encoded, no?

Contributor Author: -1 means perfectly dispersed; if we are looking for a clustered signal, we could do max(0, score) to have the score in [0, 1] from the beginning. What do you think?

W.r.t. the 1 - ..., I totally agree. Minimal differences should yield a high score.

Contributor Author (edit): I changed it so as to only consider "bad" differences, please check in #304.
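
A hypothetical sketch of that direction, penalising only genes whose Moran's I drops after integration (the actual #304 implementation may differ):

# mi_diffs as above: integrated Moran's I minus the per-batch maximum
bad_diffs = mi_diffs.clip(upper=0)  # improvements count as no penalty
avg_mi_diff = bad_diffs.mean()      # now in [-2, 0]
res = (avg_mi_diff + 2) / 2         # rescaled to [0, 1], 1 = nothing got worse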

        else:
            res = avg_mi_diff

    else:
        # Moran's I on integrated data
        compute_mi(adata_post, hvgs)
        res = adata_post.var["morans_i"].mean()
        if rescale:
            # Moran's I will be in [-1, 1]
            res = (res + 1) / 2
Collaborator: Also here: should a Moran's I of 0 be better than one of -1? Could that difference only be due to sparsity of the marker?

Contributor Author: I changed it to scores > 0, as we are mostly interested in clusterings.


    return res
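
For reference, the per-gene quantity computed by sc.metrics.morans_i is the standard Moran's I on the neighbour graph, using the graph connectivities as weights $w_{ij}$:

$$I = \frac{N}{\sum_i \sum_j w_{ij}} \cdot \frac{\sum_i \sum_j w_{ij}\,(x_i - \bar{x})(x_j - \bar{x})}{\sum_i (x_i - \bar{x})^2}$$

where $x_i$ is the gene's expression in cell $i$ and $N$ is the number of cells. Values near 1 indicate expression that clusters on the graph, values near -1 indicate dispersed expression, and values near 0 indicate no autocorrelation.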
8 changes: 7 additions & 1 deletion scib/preprocessing.py
@@ -3,7 +3,7 @@

import anndata2ri
import numpy as np
-import rpy2.rinterface_lib.callbacks  # rpy2 for running R code
+import rpy2.rinterface_lib.callbacks
import rpy2.rinterface_lib.embedded
import rpy2.robjects as ro
import scanpy as sc
@@ -506,6 +506,12 @@ def hvg_batch(
    hvg = nbatch1_dispersions.index[:]
    not_n_batches = 1

    # Check that target_genes does not exceed the total number of genes
    if not target_genes <= adata_hvg.n_vars:
        raise ValueError(
            f"Number of HVGs ({target_genes=}) cannot exceed the total number of genes ({adata_hvg.n_vars=})"
        )

    while not enough:
        target_genes_diff = target_genes - len(hvg)

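A hedged example of the new guard in action (adata is a placeholder and the number is made up):

import scib

# Requesting more HVGs than the data contains now fails fast
scib.preprocessing.hvg_batch(adata, batch_key="batch", target_genes=10**6)
# raises ValueError: Number of HVGs (target_genes=1000000) cannot exceed the total number of genes (...)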
39 changes: 39 additions & 0 deletions tests/metrics/test_morans_i.py
@@ -0,0 +1,39 @@
import scib
from tests.common import LOGGER, assert_near_exact


def test_morans_i_nhvg(adata_neighbors):
    adata_int = adata_neighbors.copy()
    score = scib.me.morans_i(
        adata_pre=adata_neighbors,
        adata_post=adata_int,
        batch_key="batch",
        n_hvg=100,
    )
    LOGGER.info(f"score: {score}")
    expected_score = 0.65204
    assert_near_exact(score, expected_score, diff=1e-5)


def test_morans_i_hvgs(adata_neighbors):
    adata_int = adata_neighbors.copy()
    score = scib.me.morans_i(
        adata_pre=adata_neighbors,
        adata_post=adata_int,
        batch_key="batch",
        hvgs=[
            "TNFRSF4",
            "TNFRSF1B",
            "EFHD2",
            "C1QA",
            "C1QB",
            "STMN1",
            "SMAP2",
            "PRDX1",
            "SCP2",
            "C1orf162",
        ],
    )
    LOGGER.info(f"score: {score}")
    expected_score = 0.64337
    assert_near_exact(score, expected_score, diff=1e-5)
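
These tests rely on the adata_neighbors fixture provided by the test suite; assuming a standard pytest setup, they could be run with:

pytest tests/metrics/test_morans_i.py -v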