Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Milo neighborhood groups as in MiloR #680

Open
MaximilianNuber opened this issue Nov 21, 2024 · 0 comments
Open

Milo neighborhood groups as in MiloR #680

MaximilianNuber opened this issue Nov 21, 2024 · 0 comments
Labels
enhancement New feature or request

Comments

@MaximilianNuber
Copy link

Description of feature

Dear all,

I would like to ask for a feature in the Milo submodule: Grouping Neighborhoods, as implemented in MiloR (https://github.com/MarioniLab/miloR/blob/master/R/groupNhoods.R).
@emdann suggested that I request the feature for the pertpy implementation (see emdann/milopy#49).

As I am using the feature already to analyse my data I have code suggestions, should you be interested.
I adjusted the code and example to work with pertpy-Milo using the tutorial data, compare the neighborhood grouping results to the MiloR implementation, and included a suggestion for neighborhood group markers based on the pertpy differential expression module.
I would be very thankful for any help to correct/verify my code.

Best regards,
max

Image
Image

Marker genes from find_nhood_groups_markers for neighborhood groups 5 vs 4 in "Python Neighborhood groups":
Image

MiloR neighborhood groups:
Image

Neighborhood group functions:

def find_nhood_groups(
    mdata, 
    SpatialFDR_threshold = 0.1,
    merge_discord = False,
    max_lfc_delta=None,
    overlap=1,
    subset_nhoods: None | str = None,
):
    """Get igraph graph from adjacency matrix. Adjusted from scanpy louvain clustering."""
    nhs = mdata["rna"].obsm["nhoods"].copy()
    nhood_adj = mdata["milo"].varp["nhood_connectivities"].copy()
    da_res = mdata["milo"].var.copy()
    is_da = np.asarray(da_res.SpatialFDR <= SpatialFDR_threshold)

    if subset_nhoods is not None:
        print(da_res.shape)
        mask_da_res = (
            da_res
            .query(subset_nhoods)
            .copy()
        )
        mask = np.asarray([True if x in mask_da_res.index else False for x in da_res.index])
        da_res = da_res.loc[mask, :].copy()
        print(da_res.shape)
        # mask = da_res.index.values.astype(bool)
        print(mask.shape)
        nhs = nhs[:, mask].copy()
        print(mask.shape)
        print(nhood_adj.shape)
        nhood_adj = nhood_adj[:, mask]
        nhood_adj = nhood_adj[mask, :].copy()
        print(nhood_adj.shape)
        # is_da = np.asarray(da_res.SpatialFDR <= SpatialFDR_threshold)
        is_da = is_da[mask].copy()
        # print(is_da.shape)

    # da_res.loc[is_da, 'logFC'].values @ (da_res.loc[is_da, 'logFC']).T.values
    # print(da_res.loc[is_da, 'logFC'])

    if merge_discord is False:
        # discord_sign = np.sign(da_res.loc[is_da, 'logFC'].values @ (da_res.loc[is_da, 'logFC'])) < 0
        # Assuming da.res is a pandas DataFrame and is.da is a boolean array
        discord_sign = np.sign(da_res.loc[is_da, 'logFC'].values @ da_res.loc[is_da, 'logFC'].values.T) < 0
        # print(discord_sign.shape)
        # nhood_adj[is_da, is_da][discord_sign] <- 0
        # print(nhood_adj.shape)
        nhood_adj[(is_da, is_da)][discord_sign] = 0

    if overlap > 1:
        nhood_adj[nhood_adj < overlap] = 0

    if max_lfc_delta is not None:  
        lfc_diff = np.array([da_res['logFC'].values - x for x in da_res['logFC'].values])
        nhood_adj[np.abs(lfc_diff) > max_lfc_delta] = 0

    # binarise
    # nhood_adj = np.asarray((nhood_adj > 0) + 0)
    nhood_adj = (nhood_adj > 0).astype(int)

    g = sc._utils.get_igraph_from_adjacency(nhood_adj, directed = False)

    weights = None
    part = g.community_multilevel(weights=weights)

    groups = np.array(part.membership)
    groups = groups.astype(int).astype(str)

    if subset_nhoods is not None:
        mdata["milo"].var["nhood_groups"] = np.nan
        mdata["milo"].var.loc[mask, "nhood_groups"] = groups
        return None
    mdata["milo"].var["nhood_groups"] = groups

Nhood group markers:

from typing import Sequence
from lamin_utils import logger

class Limma(pt.tl._differential_gene_expression._base.LinearModelBase):

    def _check_counts(self):
        # check_is_integer_matrix(self.data)
        pass
        
    def fit(self, voom = False, **kwargs):

        """Fit model on normalized data using limma.

        Note: this creates its own AnnData object for downstream.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()
        """

        if voom:
            print("voom selected")
            self.voom_fit(**kwargs)
            return None
        
        from scipy.sparse import issparse
        # For running in notebook
        # pandas2ri.activate()
        # rpy2.robjects.numpy2ri.activate()
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            # pandas2ri.activate()
            # rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
            limma = importr("limma")
        except ImportError as e:
            raise ImportError(
                "edgeR requires a valid R installation with the following packages:\n"
                "edgeR, BiocParallel, RhpcBLASctl"
            ) from e

        # Convert dataframe
        with localconverter(ro.default_converter + numpy2ri.converter):
            expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
            if issparse(expr):
                expr = expr.T.toarray()
            else:
                expr = expr.T

        
        expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))

        r_obs = ro.conversion.py2rpy(self.adata.obs)
        # rbase = importr("base")

        with localconverter(default_converter + pandas2ri.converter):
            r_design = ro.conversion.py2rpy(self.design)

        self.fit = limma.lmFit(expr_r, r_design)

        # self.fit = limma.eBayes(fit)

    ### for the sake of completeness
    def voom_fit(self, voom_kwargs = {}, **lmFit_kwargs):

        """Fit model on count data using limma-voom.

        Note: this creates its own AnnData object for downstream.

        Args:
            **kwargs: Keyword arguments specific to glmQLFit()
        """

        print("Fitting limma with voom")
        
        from scipy.sparse import issparse
        # For running in notebook
        # pandas2ri.activate()
        # rpy2.robjects.numpy2ri.activate()
        try:
            import rpy2.robjects.numpy2ri
            import rpy2.robjects.pandas2ri
            from rpy2 import robjects as ro
            from rpy2.robjects import numpy2ri, pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr

            # pandas2ri.activate()
            # rpy2.robjects.numpy2ri.activate()

        except ImportError:
            raise ImportError("edger requires rpy2 to be installed.") from None

        try:
            edger = importr("edgeR")
            limma = importr("limma")
        except ImportError as e:
            raise ImportError(
                "!!limma requires a valid R installation with the following packages:\n"
                "edgeR, limma, BiocParallel, RhpcBLASctl"
            ) from e

        # Convert dataframe
        with localconverter(ro.default_converter + numpy2ri.converter):
            expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
            if issparse(expr):
                expr = expr.T.toarray()
            else:
                expr = expr.T

        
        expr_r = ro.conversion.py2rpy(pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names))

        r_obs = ro.conversion.py2rpy(self.adata.obs)
        # rbase = importr("base")

        with localconverter(default_converter + pandas2ri.converter):
            r_design = ro.conversion.py2rpy(self.design)

        v = limma.voom(expr_r, r_design, **voom_kwargs)

        self.fit = limma.lmFit(v, r_design, **lmFit_kwargs)
            
    def _test_single_contrast(self, contrast, trend = False, robust = True, **kwargs):
        
        with localconverter(default_converter + pandas2ri.converter):
            r_design = ro.conversion.py2rpy(self.design)

        r_string = '''
        FUN = function(fit, contrast, design) {
            # contrast_matrix <- makeContrasts(contrast, levels=design)
            # print(contrast_matrix)
            contrast <- as.numeric(contrast)
            print(class(contrast))
            
            fit2 = limma::contrasts.fit(fit, contrast)
            print("check")
            efit <- limma::eBayes(fit2)
            print("check")
            # print(coef(efit))
            res <- limma::topTable(efit, coef = 1, n = Inf)
        }
        '''

        r_pkg = STAP(r_string, "r_pkg")

        res = r_pkg.FUN(self.fit, contrast, r_design)

        with localconverter(default_converter + pandas2ri.converter):
            de_res = pandas2ri.rpy2py(res)

        de_res.index.name = "variable"
        de_res = de_res.reset_index()

        de_res =  de_res.rename(columns={"P.Value": "p_value", "logFC": "log_fc", "adj.P.Val": "adj_p_value"})

        return de_res
def get_cells_in_nhoods(mdata, nhood_ids):
    '''
    Get cells in neighbourhoods of interest '''
    in_nhoods = np.array(mdata["rna"].obsm['nhoods'][:,nhood_ids.astype('int')].sum(1))
    ad = mdata["rna"][in_nhoods.astype(bool), :].copy()
    return ad
def _get_groups(
    mdata, 
    sample_col,
    nhood_group_obs = "nhood_groups",
    confounders_obs = [],
    pseudobulk = True,
    func = "sum",
    layer = "counts"
):

    layer = None if layer == "X" else layer
    
    ad_list = []
    groups = mdata["milo"].var[nhood_group_obs].dropna().unique()
    print(groups)

    mdata["milo"].var["_my_groups"] = mdata["milo"].var[nhood_group_obs]

    for g in groups:
        nhood_ids = (
            mdata["milo"].var
            .query('_my_groups == @g')
            .index.to_list()
        )
    
        nhood_ids = np.asarray(nhood_ids)
        cur_ad = get_cells_in_nhoods(mdata, nhood_ids)
        cur_ad.obs[nhood_group_obs] = g
        
        if pseudobulk:
            cur_ad = sc.get.aggregate(cur_ad, by = [sample_col, nhood_group_obs] + confounders_obs, func = func,
                                     layer=layer)
        
        ad_list.append(cur_ad)
    
    ad = sc.concat(ad_list)

    return ad

def find_nhood_group_markers(
    mdata, 
    sample_col = "patient_id",
    nhood_group_obs = "nhood_groups",
    design = "~0+nhood_groups",
    baseline = "non_enriched",
    group_to_compare = "enriched",
    confounders_obs = [],
    pseudobulk = True, 
    func = "mean",
    layer = "X",
    pb_model = Limma,
    return_model = False,
    **kwargs
):
    ad = _get_groups(
        mdata, 
        sample_col = sample_col,
        nhood_group_obs = nhood_group_obs,
        confounders_obs = confounders_obs,
        pseudobulk = pseudobulk, 
        func = func,
        layer = layer
    )

    ad.X = ad.layers[func]

    mod = pb_model(ad, design = design)
    mod.fit(**kwargs)
    res_df = mod._test_single_contrast(
        mod.contrast(nhood_group_obs, baseline = baseline, group_to_compare = group_to_compare)
    )
    if return_model:
        return res_df, mod

    return res_df

Code to reproduce the example plots:

import os
os.environ["R_HOME"] = r_env_path
import rpy2.rinterface_lib.callbacks
# import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
from rpy2.robjects import r
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter

from rpy2.robjects.packages import STAP

# sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
%load_ext rpy2.ipython
import numpy as np
import scanpy as sc
import seaborn as sns
# from scipy.stats import median_abs_deviation
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import pertpy as pt

adata = pt.data.stephenson_2021_subsampled()
adata.obs["COVID_severity"] = adata.obs["Status_on_day_collection_summary"].copy()  # short name
adata.obs[["patient_id", "COVID_severity"]].drop_duplicates().value_counts("COVID_severity")

sc.pl.umap(adata, color=["author_cell_type"], legend_loc="on data")
sc.pl.umap(adata, color=["Site", "Status"], wspace=0.3, size=8)

## Exclude LPS samples
adata = adata[adata.obs["Status"] != "LPS"].copy()

## Initialize object for Milo analysis
milo = pt.tl.Milo()
mdata = milo.load(adata)

from sklearn_ann.kneighbors.annoy import AnnoyTransformer
transformer = AnnoyTransformer(n_neighbors=150)
sc.pp.neighbors(mdata["rna"], use_rep="X_scVI", transformer = transformer)

milo.make_nhoods(mdata["rna"], prop=0.1)

nhood_size = np.array(mdata["rna"].obsm["nhoods"].sum(0)).ravel()
plt.hist(nhood_size, bins=100)
plt.xlabel("# cells in nhood")
plt.ylabel("# nhoods");

milo.count_nhoods(mdata, sample_col="patient_id")

# Reorder categories
# (by default, the last category is taken as the condition of interest)
mdata["rna"].obs["Status"] = mdata["rna"].obs["Status"].cat.reorder_categories(["Healthy", "Covid"])
milo.da_nhoods(mdata, design="~Status")

old_figsize = plt.rcParams["figure.figsize"]
plt.rcParams["figure.figsize"] = [10, 5]
plt.subplot(1, 2, 1)
plt.hist(mdata["milo"].var.PValue, bins=50)
plt.xlabel("P-Vals")
plt.subplot(1, 2, 2)
plt.plot(mdata["milo"].var.logFC, -np.log10(mdata["milo"].var.SpatialFDR), ".")
plt.xlabel("log-Fold Change")
plt.ylabel("- log10(Spatial FDR)")
plt.tight_layout()
plt.rcParams["figure.figsize"] = old_figsize

milo.build_nhood_graph(mdata)

plt.rcParams["figure.figsize"] = [7, 7]
milo.plot_nhood_graph(
    mdata,
    alpha=0.1,  ## SpatialFDR level (1%)
    min_size=1,  ## Size of smallest dot
)

milo.annotate_nhoods(mdata, anno_col="author_cell_type")
plt.hist(mdata["milo"].var["nhood_annotation_frac"], bins=30)
plt.xlabel("celltype fraction")

mdata["milo"].var["nhood_annotation"] = mdata["milo"].var["nhood_annotation"].astype(str)
mdata["milo"].var.loc[mdata["milo"].var["nhood_annotation_frac"] < 0.6, "nhood_annotation"] = "Mixed"

milo.plot_da_beeswarm(mdata, alpha=0.1)

find_nhood_groups(mdata, # subset_nhoods = "nhood_annotation == 'CD83_CD14_mono'"
                 )


### Nhood groups beeswarm plot

milo.plot_da_beeswarm(
    mdata,
    anno_col = "nhood_groups",
)

### Python neighbor groups plot
from legendkit import cat_legend, size_legend, vstack
plt_df = pd.DataFrame(mdata["milo"].varm["X_milo_graph"],
                      columns = ["UMAP_"+str(i+1) for i in range(2)]
                     )

plt_df["nhood_groups"] = mdata["milo"].var["nhood_groups"].values
plt_df["nhood_groups"] = plt_df["nhood_groups"].astype(str)

plt_df["size"] = mdata["milo"].var["Nhood_size"].values
plt_df = plt_df.sort_values("nhood_groups")

label_df = label_df = (
    plt_df
    .groupby("nhood_groups")
    .median()
)

from legendkit import size_legend, cat_legend, vstack


# groups = plt_df["nhood_groups"].unique()
groups = []
[groups.append(n) for n in plt_df["nhood_groups"] if n not in groups]
print(groups)

palette = [color for color in sns.color_palette("Paired", 12) for _ in range(len(groups))]
scale_factor = 0.2
sns.scatterplot(
    x=plt_df["UMAP_1"],
    y=plt_df["UMAP_2"],
    size = plt_df["size"].values * scale_factor, #.apply(lambda x: x*0.2),
    hue = plt_df["nhood_groups"],#.astype("category").cat.codes,
    legend = False,
    linewidth=0,
    palette = sns.color_palette("Paired", 12)
)
sns.despine()
slegend = size_legend(mdata["milo"].var["Nhood_size"].values * scale_factor, loc = "out right center",
           title = "Nhood size", func = lambda x: x*(1/scale_factor))

color = dict(
    colors = sns.color_palette("Paired", 12),
    labels = groups
)

ax = plt.gca()
ax.set_title("Python Neighborhood groups", fontsize = 18)
handles, labels = ax.get_legend_handles_labels()

clegend = cat_legend(**color, title = "Nhood group", handle = "circle")
vstack([clegend, slegend], # title="Stack Vertically",
       loc="out right center", spacing=10, frameon=False, ax=ax)

for _group, row in label_df.iterrows():
    plt.text(row["UMAP_1"], row["UMAP_2"], _group, fontsize = 15)

plt.show()
###############

import anndata2ri
mdata = mdata.copy()
adata = pt.data.stephenson_2021_subsampled()
adata = adata[adata.obs["Status"] != "LPS"].copy()

da_res = mdata["milo"].var.copy()
da_res = (
    da_res.drop("nhood_groups", axis = 1)
)

with localconverter(anndata2ri.converter):
    r.assign("sce", adata)
    r.assign("nhoods", mdata["rna"].obsm["nhoods"])
    r.assign("nhoodCounts", mdata["milo"].X.T)
    r.assign("nhood_adjacency", mdata["milo"].varp["nhood_connectivities"])
    r.assign("da_results", da_res)
    r.assign("nhood_graph", mdata["milo"].varm["X_milo_graph"])
%%R -o da_results
suppressMessages(library(SingleCellExperiment))
suppressMessages(library(scran))
suppressMessages(library(miloR))
suppressMessages(library(Matrix))

milo <- Milo(sce)
nhoodAdjacency(milo) <- nhood_adjacency
nhoodCounts(milo) <- nhoodCounts
nhoods(milo) <- as(nhoods, "CsparseMatrix")
nhoodGraph(milo) <- nhood_graph

colnames(nhoodAdjacency(milo)) <- da_results$index_cell
rownames(nhoodAdjacency(milo)) <- da_results$index_cell

da_results <- groupNhoods(milo, da_results)

### MiloR nhood groups plot

mdata["milo"].var["nhood_groups_miloR"] = da_results["NhoodGroup"]

from legendkit import cat_legend, size_legend, vstack
plt_df = pd.DataFrame(mdata["milo"].varm["X_milo_graph"],
                      columns = ["UMAP_"+str(i+1) for i in range(2)]
                     )

plt_df["nhood_groups"] = mdata["milo"].var["nhood_groups_miloR"].values
plt_df.loc[plt_df.nhood_groups.isnull(), "nhood_groups"] = np.nan
plt_df["nhood_groups"] = plt_df["nhood_groups"].astype(str)

plt_df["size"] = mdata["milo"].var["Nhood_size"].values
plt_df = plt_df.sort_values("nhood_groups", na_position = "first")

label_df = label_df = (
    plt_df
    .groupby("nhood_groups")
    .median()
)

from legendkit import size_legend, cat_legend, vstack


# groups = plt_df["nhood_groups"].unique()
groups = []
[groups.append(n) for n in plt_df["nhood_groups"] if n not in groups]
print(groups)

palette = [color for color in sns.color_palette("Paired", 12) for _ in range(len(groups))]
scale_factor = 0.2
sns.scatterplot(
    x=plt_df["UMAP_1"],
    y=plt_df["UMAP_2"],
    size = plt_df["size"].values * scale_factor, #.apply(lambda x: x*0.2),
    hue = plt_df["nhood_groups"],#.astype("category").cat.codes,
    legend = False,
    linewidth=0,
    palette = sns.color_palette("Paired", 12)
)
sns.despine()
slegend = size_legend(mdata["milo"].var["Nhood_size"].values * scale_factor, loc = "out right center",
           title = "Nhood size", func = lambda x: x*(1/scale_factor))

color = dict(
    colors = sns.color_palette("Paired", 12),
    labels = groups
)

ax = plt.gca()
ax.set_title("MiloR Neighborhood groups", fontsize = 18)
handles, labels = ax.get_legend_handles_labels()

clegend = cat_legend(**color, title = "Nhood group", handle = "circle")
vstack([clegend, slegend], # title="Stack Vertically",
       loc="out right center", spacing=10, frameon=False, ax=ax)

for _group, row in label_df.iterrows():
    plt.text(row["UMAP_1"], row["UMAP_2"], _group, fontsize = 15)

plt.show()
mdata["milo"].var.loc[~mdata["milo"].var["nhood_groups"].isin(["4", "5"]), "nhood_groups"] = np.nan

### uses the above Limma class by default, but can be switched out for pt.tl.EdgeR when a counts layer exists.
res_df, mod = find_nhood_group_markers(mdata, 
                        sample_col = "patient_id",
                        baseline = "4", group_to_compare = "5",
                        return_model = True,
                                      voom = True)

### Volcano Plot:
mod.plot_volcano(res_df, log2fc_thresh = 0.1,legend_pos = (1.3, 1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement New feature or request
Projects
None yet
Development

No branches or pull requests

1 participant