Add metric morans i #303
base: main
Changes from all commits
97e5a73
ed18a80
3fb2ea9
5dd45fd
decb65d
f35b0c9
88ba08f
@@ -0,0 +1,151 @@
import numpy as np
import pandas as pd
import scanpy as sc

from scib.preprocessing import hvg_batch


def morans_i(
    adata_pre,
    adata_post,
    batch_key,
    n_hvg=1000,
    embed="X_pca",
    rescale=True,
    compare_pre=False,
    hvgs=None,
):
"""Moran's I | ||||
|
||||
Compute Moran's I on HVGs (defined across batches) on integrated data as a measure of bio conservation. | ||||
The metric can be computed either as | ||||
|
||||
1. mean difference between Morans's I on embedding of integrated data and max Moran's I of individual non-integrated batches or | ||||
2. mean Morans's I on embedding of integrated data. | ||||
|
||||
:param adata_pre: Non-integrated data. | ||||
Embedding is computed on scaled (m=0, s=1) X (expression) | ||||
per batch (recomputing HVGs, scaling, pca, neighbours). | ||||
For Moran's I unscaled X is used. | ||||
:param adata_post: Integrate data. Should have precomputed embedding and expression in X. | ||||
:param batch_key: Batch obs col in adata_pre. | ||||
:param n_hvg: How many top HVGs to use for Moran's I computation. | ||||
:param embed: Embedding to use for computation of neighbours in adata_post | ||||
:param rescale: Whether to rescale result to [0,1] so that 1 is better bio conservation | ||||
:param compare_pre: Whether to compute metric under scenario A instead of B. | ||||
:param hvgs: Custom list of genes to use for Moran's I computation (instead of hvgs | ||||
computed across batches). | ||||
:return: The resulting metric. | ||||
""" | ||||
    # Prepare adatas
    # Get integrated connectivities for Moran's I
    adata_post = adata_post.copy()
    sc.pp.neighbors(adata_post, use_rep=embed)
    # Prepare pre data
    adata_pre = adata_pre.copy()
    adata_pre.obs[batch_key] = adata_pre.obs[batch_key].astype("category")
Review comment: Please add a call to … (Line 25 in f35b0c9).
    # Get HVGs across samples
    if hvgs is None:
        hvgs = hvg_batch(
            adata_pre, batch_key=batch_key, target_genes=n_hvg, flavor="cell_ranger"
        )
    else:
        assert adata_pre.var_names.isin(hvgs).sum() == len(hvgs)

    def compute_mi(data, hvgs):
        """
        Helper function for computing Moran's I on HVGs that are non-constant
        and adding the values to var.
        :param data: adata; the result is added to var
        :param hvgs: Genes for which to compute Moran's I
        """
        # Make sure that the genes used for computation are non-constant
        cond = (
            np.array(np.mean(data[:, hvgs].X, axis=0))
            != np.array(data[0, hvgs].X.todense())
        ).squeeze()
Review comment: A mean equal to the first value could happen even if the genes are not constant, no? Why not define this via the standard deviation?

Reply: You are right! std is better!
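A minimal sketch of the std-based filter suggested in this thread (hypothetical; the diff above still uses the mean-based check):

```python
import numpy as np

# Hypothetical std-based non-constant filter; `data` is the AnnData and `hvgs`
# the candidate gene list, as inside compute_mi.
X = data[:, hvgs].X
X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)  # densify if sparse
cond = X.std(axis=0) > 0  # keep only genes with non-zero variance across cells
hvgs_used = np.array(hvgs)[cond]
```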
        hvgs_used = np.array(hvgs)[cond]
        if len(hvgs) > len(hvgs_used):
            print(
                "Using subset (%i) of hvgs (%i) that are not constant in the data"
                % (len(hvgs_used), len(hvgs))
            )
        # Compute Moran's I
        morans_i = sc.metrics.morans_i(data[:, hvgs_used])
        morans_i = pd.Series(morans_i, index=hvgs_used)
        # This should not happen, just in case
        if (morans_i > 1).any() or (morans_i < -1).any():
            raise ValueError(
                "Problems in computing Moran's I, value not between -1 and 1"
            )
        data.var["morans_i"] = morans_i

    if compare_pre:
        # Get per-sample connectivities
        adatas_sample = dict()
        for sample in adata_pre.obs[batch_key].unique():
            # Subset only to cells from one sample
            adata_sample = adata_pre[adata_pre.obs[batch_key] == sample, :].copy()
            # Compute HVGs for each sample
            sc.pp.highly_variable_genes(
                adata_sample,
                flavor="cell_ranger",
                batch_key="study_sample",
                n_top_genes=2000,
            )
            adata_sample_scl = adata_sample.copy()
            # Preprocess each sample: scaling, PCA, neighbours
            sc.pp.scale(adata_sample_scl, max_value=10)
Review comment: Just out of curiosity: any reason for scaling here specifically, or out of habit?

Reply: Most of the code was adapted from Karin. @Hrovatin, do you have an opinion here?
            sc.pp.pca(
                adata_sample_scl,
                n_comps=15,
                use_highly_variable=True,
                svd_solver="arpack",
            )
            sc.pp.neighbors(adata_sample_scl, n_pcs=15)
            # Add computed info back to unscaled data
            adata_sample.uns["neighbors"] = adata_sample_scl.uns["neighbors"]
            adata_sample.obsp["connectivities"] = adata_sample_scl.obsp[
                "connectivities"
            ]
            adata_sample.obsp["distances"] = adata_sample_scl.obsp["distances"]
            adata_sample.uns["pca"] = adata_sample_scl.uns["pca"]
            adata_sample.obsm["X_pca"] = adata_sample_scl.obsm["X_pca"]
            # Save adata of sample
            adatas_sample[sample] = adata_sample
            del adata_sample
            del adata_sample_scl

        # Compute Moran's I across samples and on integrated data
        for data in list(adatas_sample.values()) + [adata_post]:
            compute_mi(data, hvgs)

        # Compute differences in Moran's I
        # Table of Moran's I values across batches
        batch_mis = []
        for sample, data in adatas_sample.items():
            mi = data.var["morans_i"]
            mi.name = sample
            batch_mis.append(mi)
        batch_mis = pd.concat(batch_mis, axis=1)
        # Difference with integrated data
        mi_diffs = adata_post.var["morans_i"] - batch_mis.max(axis=1)
        avg_mi_diff = mi_diffs.mean()
Review comment: I don't entirely understand this part. You are expecting optimal integration to mean that Moran's I is the same in the full integration as the max in any individual batch? Is the max used due to the possibility of having unique cell types in a single batch? Is this affected by cell type composition differences across batches?

Reply: I guess the max gives an upper bound on the degree to which an individual gene can be clustered. I changed it in #304 such that only worse Moran's I scores are counted - if you agree that the max is an upper bound per gene, then a difference > 0 would mean that the integration was successful in maintaining this trait of the data. To compare different integrations, I believe that …
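A minimal sketch of the #304 behaviour described in the reply (hypothetical; the diff above still averages all differences):

```python
# Hypothetical variant per the reply: treat the per-batch maximum as an upper
# bound per gene and count only genes whose Moran's I got worse, i.e. clip
# positive differences to 0 before averaging.
mi_diffs = adata_post.var["morans_i"] - batch_mis.max(axis=1)
avg_mi_diff = mi_diffs.clip(upper=0).mean()
```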
        # Rescale so that it is between [0, 1] where 1 is better
        # Moran's I will be in [-1, 1] and thus the difference can be in [-2, 2]
        if rescale:
            res = (avg_mi_diff + 2) / 4
Review comment: Another note on rescaling. Here it seems that a difference of 2 (perfect auto-correlation of markers in the integrated data but a max per-batch auto-correlation of -1) is optimal. I'm not sure this should be the case. Wouldn't the idea be to detect the same spatial correlation structure in a single batch as across all of them? I guess you want a … Also, should a Moran's I of -1 and 0 be distinguished for the comparison at all? Both mean no biologically meaningful information is encoded, no?

Reply: -1 means perfectly dispersed; if we are looking for a clustered signal we could … Wrt the …

Reply: EDIT: I changed it such that only "bad" differences are considered, please check in #304.
        else:
            res = avg_mi_diff

    else:
        # Moran's I on integrated data
        compute_mi(adata_post, hvgs)
        res = adata_post.var["morans_i"].mean()
        if rescale:
            # Moran's I will be in [-1, 1]
            res = (res + 1) / 2
Review comment: Also here: should a Moran's I of 0 be better than one of -1? Could that difference be due only to sparsity of the marker?

Reply: I changed it to scores > 0, as we are mostly interested in clusterings.
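A minimal sketch of the "scores > 0" variant mentioned in the reply (hypothetical; the diff above rescales the raw mean instead):

```python
# Hypothetical variant per the reply: only positive (clustered) Moran's I values
# are treated as biologically meaningful, so clip negatives to 0 before averaging.
res = adata_post.var["morans_i"].clip(lower=0).mean()
```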
    return res
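For illustration, a hedged usage sketch of the compare_pre=True path (scenario 1), which the tests below do not exercise; `adata_raw` and `adata_int` are hypothetical placeholders:

```python
import scib

# Hypothetical inputs: `adata_raw` is non-integrated with log-normalized
# expression in X and a "batch" column in .obs; `adata_int` is the integrated
# object with an embedding in .obsm["X_pca"]. Note that the compare_pre branch
# above recomputes per-sample HVGs with batch_key="study_sample", so that obs
# column must also exist in adata_raw.
score = scib.me.morans_i(
    adata_pre=adata_raw,
    adata_post=adata_int,
    batch_key="batch",
    n_hvg=100,
    compare_pre=True,  # scenario 1: compare against per-batch maxima
    rescale=True,  # map the mean difference from [-2, 2] onto [0, 1]
)
print(f"Moran's I bio-conservation score: {score:.4f}")
```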
@@ -0,0 +1,39 @@
import scib
from tests.common import LOGGER, assert_near_exact


def test_morans_i_nhvg(adata_neighbors):
    adata_int = adata_neighbors.copy()
    score = scib.me.morans_i(
        adata_pre=adata_neighbors,
        adata_post=adata_int,
        batch_key="batch",
        n_hvg=100,
    )
    LOGGER.info(f"score: {score}")
    expected_score = 0.65204
    assert_near_exact(score, expected_score, diff=1e-5)

def test_morans_i_hvgs(adata_neighbors):
    adata_int = adata_neighbors.copy()
    score = scib.me.morans_i(
        adata_pre=adata_neighbors,
        adata_post=adata_int,
        batch_key="batch",
        hvgs=[
            "TNFRSF4",
            "TNFRSF1B",
            "EFHD2",
            "C1QA",
            "C1QB",
            "STMN1",
            "SMAP2",
            "PRDX1",
            "SCP2",
            "C1orf162",
        ],
    )
    LOGGER.info(f"score: {score}")
    expected_score = 0.64337
    assert_near_exact(score, expected_score, diff=1e-5)
Review comment: 1000 HVGs sounds like a lot as a default. Is there any reason for this? Can it be reduced?

Reply: I agree, this can be reduced - maybe to 100?