SCVI Extra Covariates¶

First, we train scvi with only batch covariates on a dataset of Drosophila myoblast cells and show that cell sex and cell cycle confound our latent space.
Then, we train scvi while conditioning on the gene expression from cell sex and cell cycle marker genes. We show that scvi is able to deconfound cell sex and cell cycle effects while preserving signal from the other genes.

In [1]:

%%capture
import sys

BRANCH = 'master'
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    !pip install --quiet --upgrade jsonschema
    !pip install --quiet multipledispatch
    !pip install --quiet scvi-tools[tutorials]==0.9.0
    !pip install --quiet statannotations
    !pip install --quiet gdown
    !pip install --quiet bbknn
%%capture
import sys

BRANCH = 'master'
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    !pip install --quiet --upgrade jsonschema
    !pip install --quiet multipledispatch
    !pip install --quiet scvi-tools[tutorials]==0.9.0
    !pip install --quiet statannotations
    !pip install --quiet gdown
    !pip install --quiet bbknn

In [2]:

import scvi
import anndata
import pandas as pd
import scanpy as sc
import numpy as np
import os
import seaborn as sns 
import matplotlib.pyplot as plt
import plotnine as p9
from statannot import add_stat_annotation
import gdown
from scipy.stats import ranksums

sc.settings.n_jobs=10
import scvi
import anndata
import pandas as pd
import scanpy as sc
import numpy as np
import os
import seaborn as sns 
import matplotlib.pyplot as plt
import plotnine as p9
from statannot import add_stat_annotation
import gdown
from scipy.stats import ranksums

sc.settings.n_jobs=10

In [3]:

sns.reset_orig()
sc.settings._vector_friendly = True
# p9.theme_set(p9.theme_classic)
plt.rcParams["svg.fonttype"] = "none"
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["savefig.transparent"] = True
plt.rcParams["figure.figsize"] = (4, 4)

plt.rcParams["axes.titlesize"] = 15
plt.rcParams["axes.titleweight"] = 500
plt.rcParams["axes.titlepad"] = 8.0
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["axes.labelweight"] = 500
plt.rcParams["axes.linewidth"] = 1.2
plt.rcParams["axes.labelpad"] = 6.0
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False


plt.rcParams["font.size"] = 11
# plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Helvetica', "Computer Modern Sans Serif", "DejaVU Sans"]
plt.rcParams['font.weight'] = 500

plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['xtick.minor.size'] = 1.375
plt.rcParams['xtick.major.size'] = 2.75
plt.rcParams['xtick.major.pad'] = 2
plt.rcParams['xtick.minor.pad'] = 2

plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['ytick.minor.size'] = 1.375
plt.rcParams['ytick.major.size'] = 2.75
plt.rcParams['ytick.major.pad'] = 2
plt.rcParams['ytick.minor.pad'] = 2

plt.rcParams["legend.fontsize"] = 12
plt.rcParams['legend.handlelength'] = 1.4
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['legend.scatterpoints'] = 3
plt.rcParams['legend.frameon'] = False

plt.rcParams['lines.linewidth'] = 1.7

DPI = 300
sns.reset_orig()
sc.settings._vector_friendly = True
# p9.theme_set(p9.theme_classic)
plt.rcParams["svg.fonttype"] = "none"
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["savefig.transparent"] = True
plt.rcParams["figure.figsize"] = (4, 4)

plt.rcParams["axes.titlesize"] = 15
plt.rcParams["axes.titleweight"] = 500
plt.rcParams["axes.titlepad"] = 8.0
plt.rcParams["axes.labelsize"] = 14
plt.rcParams["axes.labelweight"] = 500
plt.rcParams["axes.linewidth"] = 1.2
plt.rcParams["axes.labelpad"] = 6.0
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False


plt.rcParams["font.size"] = 11
# plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Helvetica', "Computer Modern Sans Serif", "DejaVU Sans"]
plt.rcParams['font.weight'] = 500

plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['xtick.minor.size'] = 1.375
plt.rcParams['xtick.major.size'] = 2.75
plt.rcParams['xtick.major.pad'] = 2
plt.rcParams['xtick.minor.pad'] = 2

plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['ytick.minor.size'] = 1.375
plt.rcParams['ytick.major.size'] = 2.75
plt.rcParams['ytick.major.pad'] = 2
plt.rcParams['ytick.minor.pad'] = 2

plt.rcParams["legend.fontsize"] = 12
plt.rcParams['legend.handlelength'] = 1.4
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['legend.scatterpoints'] = 3
plt.rcParams['legend.frameon'] = False

plt.rcParams['lines.linewidth'] = 1.7

DPI = 300

In [4]:

# Geary's C code from https://github.com/theislab/scanpy/pull/915
# TODO: remove this once above PR is merged into scanpy 1.8


from typing import Optional, Union

from anndata import AnnData
from multipledispatch import dispatch
import numba
import numpy as np
import pandas as pd
from scipy import sparse


def _choose_obs_rep(adata, *, use_raw=False, layer=None, obsm=None, obsp=None):
    """
    Choose array aligned with obs annotation.
    """
    is_layer = layer is not None
    is_raw = use_raw is not False
    is_obsm = obsm is not None
    is_obsp = obsp is not None
    choices_made = sum((is_layer, is_raw, is_obsm, is_obsp))
    assert choices_made <= 1
    if choices_made == 0:
        return adata.X
    elif is_layer:
        return adata.layers[layer]
    elif use_raw:
        return adata.raw.X
    elif is_obsm:
        return adata.obsm[obsm]
    elif is_obsp:
        return adata.obsp[obsp]
    else:
        assert False, (
            "That was unexpected. Please report this bug at:\n\n\t"
            " https://github.com/theislab/scanpy/issues"
        )



###############################################################################
# Calculation
###############################################################################
# Some notes on the implementation:
# * This could be phrased as tensor multiplication. However that does not get
#   parallelized, which boosts performance almost linearly with cores.
# * Due to the umap setting the default threading backend, a parallel numba
#   function that calls another parallel numba function can get stuck. This
#   ends up meaning code re-use will be limited until umap 0.4.
#   See: https://github.com/lmcinnes/umap/issues/306
# * There can be a fair amount of numerical instability here (big reductions),
#   so data is cast to float64. Removing these casts/ conversion will cause the
#   tests to fail.


@numba.njit(cache=False, parallel=False)
def _gearys_c_vec(data, indices, indptr, x):
    W = data.sum()
    return _gearys_c_vec_W(data, indices, indptr, x, W)


@numba.njit(cache=False, parallel=False)
def _gearys_c_vec_W(data, indices, indptr, x, W):
    N = len(indptr) - 1
    x_bar = x.mean()
    x = x.astype(np.float_)

    total = 0.0
    for i in numba.prange(N):
        s = slice(indptr[i], indptr[i + 1])
        i_indices = indices[s]
        i_data = data[s]
        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))

    numer = (N - 1) * total
    denom = 2 * W * ((x - x_bar) ** 2).sum()
    C = numer / denom

    return C


@numba.njit(cache=False, parallel=False)
def _gearys_c_mtx_csr(
    g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
):
    M, N = x_shape
    W = g_data.sum()
    out = np.zeros(M, dtype=np.float_)
    for k in numba.prange(M):
        x_k = np.zeros(N, dtype=np.float_)
        sk = slice(x_indptr[k], x_indptr[k + 1])
        x_k_data = x_data[sk]
        x_k[x_indices[sk]] = x_k_data
        x_k_bar = np.sum(x_k_data) / N
        total = 0.0
        for i in numba.prange(N):
            s = slice(g_indptr[i], g_indptr[i + 1])
            i_indices = g_indices[s]
            i_data = g_data[s]
            total += np.sum(i_data * ((x_k[i] - x_k[i_indices]) ** 2))
        numer = (N - 1) * total
        # Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity
        # to skip some calculations
        # fmt: off
        denom = (
            2 * W
            * (
                np.sum(x_k_data ** 2)
                - np.sum(x_k_data * x_k_bar * 2)
                + (x_k_bar ** 2) * N
            )
        )
        C = numer / denom
        out[k] = C
    return out


# Simplified implementation, hits race condition after umap import due to numba
# parallel backend
# @numba.njit(cache=True, parallel=True)
# def _gearys_c_mtx_csr(
#     g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
# ):
#     M, N = x_shape
#     W = g_data.sum()
#     out = np.zeros(M, dtype=np.float64)
#     for k in numba.prange(M):
#         x_arr = np.zeros(N, dtype=x_data.dtype)
#         sk = slice(x_indptr[k], x_indptr[k + 1])
#         x_arr[x_indices[sk]] = x_data[sk]
#         outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, x_arr, W)
#         out[k] = outval
#     return out


@numba.njit(cache=False, parallel=False)
def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
    M, N = X.shape
    W = g_data.sum()
    out = np.zeros(M, dtype=np.float_)
    for k in numba.prange(M):
        x = X[k, :].astype(np.float_)
        x_bar = x.mean()

        total = 0.0
        for i in numba.prange(N):
            s = slice(g_indptr[i], g_indptr[i + 1])
            i_indices = g_indices[s]
            i_data = g_data[s]
            total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))

        numer = (N - 1) * total
        denom = 2 * W * ((x - x_bar) ** 2).sum()
        C = numer / denom

        out[k] = C
    return out


# Similar to above, simplified version umaps choice of parallel backend breaks:
# @numba.njit(cache=True, parallel=True)
# def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
#     M, N = X.shape
#     W = g_data.sum()
#     out = np.zeros(M, dtype=np.float64)
#     for k in numba.prange(M):
#         outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, X[k, :], W)
#         out[k] = outval
#     return out


###############################################################################
# Interface
###############################################################################


@dispatch(sparse.csr_matrix, sparse.csr_matrix)
def gearys_c(g, vals) -> np.ndarray:
    assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix"
    assert g.shape[0] == vals.shape[1]
    return _gearys_c_mtx_csr(
        g.data.astype(np.float_, copy=False),
        g.indices,
        g.indptr,
        vals.data.astype(np.float_, copy=False),
        vals.indices,
        vals.indptr,
        vals.shape,
    )


@dispatch(sparse.spmatrix, np.ndarray)  # noqa
def gearys_c(g, vals):
    """\
    Params
    ------
    g
        Connectivity graph as a scipy sparse matrix. Should have shape:
        `(n_obs, n_obs)`.
    vals
        Values to calculate Geary's C for. If one dimensional, should have
        shape `(n_obs,)`. If two dimensional (i.e calculating Geary's C for
        multiple variables) should have shape `(n_vars, n_obs)`.
    """
    assert g.shape[0] == g.shape[1], "`g` should be a square matrix."
    if not isinstance(g, sparse.csr_matrix):
        g = g.tocsr()
    g_data = g.data.astype(np.float_, copy=False)
    if vals.ndim == 1:
        assert g.shape[0] == vals.shape[0]
        return _gearys_c_vec(g_data, g.indices, g.indptr, vals)
    elif vals.ndim == 2:
        assert g.shape[0] == vals.shape[1]
        return _gearys_c_mtx(g_data, g.indices, g.indptr, vals)
    else:
        raise ValueError()


@dispatch(sparse.spmatrix, (pd.DataFrame, pd.Series))  # noqa
def gearys_c(g, vals):
    return gearys_c(g, vals.values)


@dispatch(sparse.spmatrix, sparse.spmatrix)  # noqa
def gearys_c(g, vals) -> np.ndarray:
    if not isinstance(g, sparse.csr_matrix):
        g = g.tocsr()
    if not isinstance(vals, sparse.csr_matrix):
        vals = vals.tocsr()
    return gearys_c(g, vals)


# TODO: Document better
# TODO: Have scanpydoc work with multipledispatch
@dispatch(AnnData)  # noqa
def gearys_c(
    adata: AnnData,
    *,
    vals: Optional[Union[np.ndarray, sparse.spmatrix]] = None,
    use_graph: Optional[str] = None,
    layer: Optional[str] = None,
    obsm: Optional[str] = None,
    obsp: Optional[str] = None,
    use_raw: bool = False,
) -> Union[np.ndarray, float]:
    """\
    Calculate `Geary's C` <https://en.wikipedia.org/wiki/Geary's_C>`_, as used
    by `VISION <https://doi.org/10.1038/s41467-019-12235-0>`_.
    Geary's C is a measure of autocorrelation for some measure on a graph. This
    can be to whether measures are correlated between neighboring cells. Lower
    values indicate greater correlation.
    ..math
        C =
        \frac{
            (N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2
        }{
            2W \sum_i (x_i - \bar{x})^2
        }
    Params
    ------
    adata
    vals
        Values to calculate Geary's C for. If this is two dimensional, should
        be of shape `(n_features, n_cells)`. Otherwise should be of shape
        `(n_cells,)`. This matrix can be selected from elements of the anndata
        object by using key word arguments: `layer`, `obsm`, `obsp`, or
        `use_raw`.
    use_graph
        Key to use for graph in anndata object. If not provided, default
        neighbors connectivities will be used instead.
    layer
        Key for `adata.layers` to choose `vals`.
    obsm
        Key for `adata.obsm` to choose `vals`.
    obsp
        Key for `adata.obsp` to choose `vals`.
    use_raw
        Whether to use `adata.raw.X` for `vals`.
    Returns
    -------
    If vals is two dimensional, returns a 1 dimensional ndarray array. Returns
    a scalar if `vals` is 1d.
    """
    if use_graph is None:
        # Fix for anndata<0.7
        if hasattr(adata, "obsp") and "connectivities" in adata.obsp:
            g = adata.obsp["connectivities"]
        elif "neighbors" in adata.uns:
            g = adata.uns["neighbors"]["connectivities"]
        else:
            raise ValueError("Must run neighbors first.")
    else:
        raise NotImplementedError()
    if vals is None:
        vals = _choose_obs_rep(
            adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp
        ).T
    return gearys_c(g, vals)
# Geary's C code from https://github.com/theislab/scanpy/pull/915
# TODO: remove this once above PR is merged into scanpy 1.8


from typing import Optional, Union

from anndata import AnnData
from multipledispatch import dispatch
import numba
import numpy as np
import pandas as pd
from scipy import sparse


def _choose_obs_rep(adata, *, use_raw=False, layer=None, obsm=None, obsp=None):
    """
    Choose array aligned with obs annotation.
    """
    is_layer = layer is not None
    is_raw = use_raw is not False
    is_obsm = obsm is not None
    is_obsp = obsp is not None
    choices_made = sum((is_layer, is_raw, is_obsm, is_obsp))
    assert choices_made <= 1
    if choices_made == 0:
        return adata.X
    elif is_layer:
        return adata.layers[layer]
    elif use_raw:
        return adata.raw.X
    elif is_obsm:
        return adata.obsm[obsm]
    elif is_obsp:
        return adata.obsp[obsp]
    else:
        assert False, (
            "That was unexpected. Please report this bug at:\n\n\t"
            " https://github.com/theislab/scanpy/issues"
        )



###############################################################################
# Calculation
###############################################################################
# Some notes on the implementation:
# * This could be phrased as tensor multiplication. However that does not get
#   parallelized, which boosts performance almost linearly with cores.
# * Due to the umap setting the default threading backend, a parallel numba
#   function that calls another parallel numba function can get stuck. This
#   ends up meaning code re-use will be limited until umap 0.4.
#   See: https://github.com/lmcinnes/umap/issues/306
# * There can be a fair amount of numerical instability here (big reductions),
#   so data is cast to float64. Removing these casts/ conversion will cause the
#   tests to fail.


@numba.njit(cache=False, parallel=False)
def _gearys_c_vec(data, indices, indptr, x):
    W = data.sum()
    return _gearys_c_vec_W(data, indices, indptr, x, W)


@numba.njit(cache=False, parallel=False)
def _gearys_c_vec_W(data, indices, indptr, x, W):
    N = len(indptr) - 1
    x_bar = x.mean()
    x = x.astype(np.float_)

    total = 0.0
    for i in numba.prange(N):
        s = slice(indptr[i], indptr[i + 1])
        i_indices = indices[s]
        i_data = data[s]
        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))

    numer = (N - 1) * total
    denom = 2 * W * ((x - x_bar) ** 2).sum()
    C = numer / denom

    return C


@numba.njit(cache=False, parallel=False)
def _gearys_c_mtx_csr(
    g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
):
    M, N = x_shape
    W = g_data.sum()
    out = np.zeros(M, dtype=np.float_)
    for k in numba.prange(M):
        x_k = np.zeros(N, dtype=np.float_)
        sk = slice(x_indptr[k], x_indptr[k + 1])
        x_k_data = x_data[sk]
        x_k[x_indices[sk]] = x_k_data
        x_k_bar = np.sum(x_k_data) / N
        total = 0.0
        for i in numba.prange(N):
            s = slice(g_indptr[i], g_indptr[i + 1])
            i_indices = g_indices[s]
            i_data = g_data[s]
            total += np.sum(i_data * ((x_k[i] - x_k[i_indices]) ** 2))
        numer = (N - 1) * total
        # Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity
        # to skip some calculations
        # fmt: off
        denom = (
            2 * W
            * (
                np.sum(x_k_data ** 2)
                - np.sum(x_k_data * x_k_bar * 2)
                + (x_k_bar ** 2) * N
            )
        )
        C = numer / denom
        out[k] = C
    return out


# Simplified implementation, hits race condition after umap import due to numba
# parallel backend
# @numba.njit(cache=True, parallel=True)
# def _gearys_c_mtx_csr(
#     g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
# ):
#     M, N = x_shape
#     W = g_data.sum()
#     out = np.zeros(M, dtype=np.float64)
#     for k in numba.prange(M):
#         x_arr = np.zeros(N, dtype=x_data.dtype)
#         sk = slice(x_indptr[k], x_indptr[k + 1])
#         x_arr[x_indices[sk]] = x_data[sk]
#         outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, x_arr, W)
#         out[k] = outval
#     return out


@numba.njit(cache=False, parallel=False)
def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
    M, N = X.shape
    W = g_data.sum()
    out = np.zeros(M, dtype=np.float_)
    for k in numba.prange(M):
        x = X[k, :].astype(np.float_)
        x_bar = x.mean()

        total = 0.0
        for i in numba.prange(N):
            s = slice(g_indptr[i], g_indptr[i + 1])
            i_indices = g_indices[s]
            i_data = g_data[s]
            total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))

        numer = (N - 1) * total
        denom = 2 * W * ((x - x_bar) ** 2).sum()
        C = numer / denom

        out[k] = C
    return out


# Similar to above, simplified version umaps choice of parallel backend breaks:
# @numba.njit(cache=True, parallel=True)
# def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
#     M, N = X.shape
#     W = g_data.sum()
#     out = np.zeros(M, dtype=np.float64)
#     for k in numba.prange(M):
#         outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, X[k, :], W)
#         out[k] = outval
#     return out


###############################################################################
# Interface
###############################################################################


@dispatch(sparse.csr_matrix, sparse.csr_matrix)
def gearys_c(g, vals) -> np.ndarray:
    assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix"
    assert g.shape[0] == vals.shape[1]
    return _gearys_c_mtx_csr(
        g.data.astype(np.float_, copy=False),
        g.indices,
        g.indptr,
        vals.data.astype(np.float_, copy=False),
        vals.indices,
        vals.indptr,
        vals.shape,
    )


@dispatch(sparse.spmatrix, np.ndarray)  # noqa
def gearys_c(g, vals):
    """\
    Params
    ------
    g
        Connectivity graph as a scipy sparse matrix. Should have shape:
        `(n_obs, n_obs)`.
    vals
        Values to calculate Geary's C for. If one dimensional, should have
        shape `(n_obs,)`. If two dimensional (i.e calculating Geary's C for
        multiple variables) should have shape `(n_vars, n_obs)`.
    """
    assert g.shape[0] == g.shape[1], "`g` should be a square matrix."
    if not isinstance(g, sparse.csr_matrix):
        g = g.tocsr()
    g_data = g.data.astype(np.float_, copy=False)
    if vals.ndim == 1:
        assert g.shape[0] == vals.shape[0]
        return _gearys_c_vec(g_data, g.indices, g.indptr, vals)
    elif vals.ndim == 2:
        assert g.shape[0] == vals.shape[1]
        return _gearys_c_mtx(g_data, g.indices, g.indptr, vals)
    else:
        raise ValueError()


@dispatch(sparse.spmatrix, (pd.DataFrame, pd.Series))  # noqa
def gearys_c(g, vals):
    return gearys_c(g, vals.values)


@dispatch(sparse.spmatrix, sparse.spmatrix)  # noqa
def gearys_c(g, vals) -> np.ndarray:
    if not isinstance(g, sparse.csr_matrix):
        g = g.tocsr()
    if not isinstance(vals, sparse.csr_matrix):
        vals = vals.tocsr()
    return gearys_c(g, vals)


# TODO: Document better
# TODO: Have scanpydoc work with multipledispatch
@dispatch(AnnData)  # noqa
def gearys_c(
    adata: AnnData,
    *,
    vals: Optional[Union[np.ndarray, sparse.spmatrix]] = None,
    use_graph: Optional[str] = None,
    layer: Optional[str] = None,
    obsm: Optional[str] = None,
    obsp: Optional[str] = None,
    use_raw: bool = False,
) -> Union[np.ndarray, float]:
    """\
    Calculate `Geary's C` `_, as used
    by `VISION `_.
    Geary's C is a measure of autocorrelation for some measure on a graph. This
    can be to whether measures are correlated between neighboring cells. Lower
    values indicate greater correlation.
    ..math
        C =
        \frac{
            (N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2
        }{
            2W \sum_i (x_i - \bar{x})^2
        }
    Params
    ------
    adata
    vals
        Values to calculate Geary's C for. If this is two dimensional, should
        be of shape `(n_features, n_cells)`. Otherwise should be of shape
        `(n_cells,)`. This matrix can be selected from elements of the anndata
        object by using key word arguments: `layer`, `obsm`, `obsp`, or
        `use_raw`.
    use_graph
        Key to use for graph in anndata object. If not provided, default
        neighbors connectivities will be used instead.
    layer
        Key for `adata.layers` to choose `vals`.
    obsm
        Key for `adata.obsm` to choose `vals`.
    obsp
        Key for `adata.obsp` to choose `vals`.
    use_raw
        Whether to use `adata.raw.X` for `vals`.
    Returns
    -------
    If vals is two dimensional, returns a 1 dimensional ndarray array. Returns
    a scalar if `vals` is 1d.
    """
    if use_graph is None:
        # Fix for anndata<0.7
        if hasattr(adata, "obsp") and "connectivities" in adata.obsp:
            g = adata.obsp["connectivities"]
        elif "neighbors" in adata.uns:
            g = adata.uns["neighbors"]["connectivities"]
        else:
            raise ValueError("Must run neighbors first.")
    else:
        raise NotImplementedError()
    if vals is None:
        vals = _choose_obs_rep(
            adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp
        ).T
    return gearys_c(g, vals)

In [5]:

# # Download the data
# import gdown
# url = 'https://drive.google.com/uc?id=1XwPlnFK55Hzf5wI2Cvy1wFAgjPMsKX1q'
# output = 'data/myoblasts.h5ad'
# gdown.download(url, output, quiet=False)

!mkdir data
!wget https://ndownloader.figshare.com/files/27458846 -O data/myoblasts.h5ad
# # Download the data
# import gdown
# url = 'https://drive.google.com/uc?id=1XwPlnFK55Hzf5wI2Cvy1wFAgjPMsKX1q'
# output = 'data/myoblasts.h5ad'
# gdown.download(url, output, quiet=False)

!mkdir data
!wget https://ndownloader.figshare.com/files/27458846 -O data/myoblasts.h5ad

mkdir: cannot create directory ‘data’: File exists
--2021-10-13 10:14:19--  https://ndownloader.figshare.com/files/27458846
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 52.16.102.173, 54.217.124.219, 2a05:d018:1f4:d000:b283:27aa:b939:8ed4, ...
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|52.16.102.173|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/27458846/myoblasts.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=bc07a208b15134267bd800bd688e3eb89dbda148f284f6c90d55b5ce6812d8a4&X-Amz-Date=20211013T171420Z&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20211013/eu-west-1/s3/aws4_request [following]
--2021-10-13 10:14:20--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/27458846/myoblasts.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=bc07a208b15134267bd800bd688e3eb89dbda148f284f6c90d55b5ce6812d8a4&X-Amz-Date=20211013T171420Z&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20211013/eu-west-1/s3/aws4_request
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.62.99
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.62.99|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 270689193 (258M) [application/octet-stream]
Saving to: ‘data/myoblasts.h5ad’

data/myoblasts.h5ad 100%[===================>] 258.15M  8.31MB/s    in 34s     

2021-10-13 10:14:55 (7.53 MB/s) - ‘data/myoblasts.h5ad’ saved [270689193/270689193]

Setting up data¶

In [6]:

# list of sex genes and cell cycle genes we will be conditioning on
sex_genes = ["lncRNA:roX1", "lncRNA:roX2", "Sxl", "msl-2"]
# cell_cycle_genes = ['PCNA', 'dnk', 'RnrS', 'RnrL', 'Claspin', 'Mcm5', 'Caf1-180',
#        'RPA2', 'HipHop', 'stg', 'Mcm6', 'dup', 'WRNexo', 'Mcm7', 'dpa',
#        'CG10336', 'Mcm3', 'Mcm2', 'RpA-70', 'Chrac-14', 'CG13690', 'RPA3',
#        'asf1', 'DNApol-alpha73', 'CycE', 'DNApol-alpha50', 'Kmn1', 'Lam',
#        'Nph', 'msd5', 'msd1', 'ctp', 'Set', 'scra', 'Chrac-16', 'ncd',
#        'Ote', 'pzg', 'HDAC1', 'nesd', 'tum', 'CG8173', 'aurB', 'feo',
#        'pav', 'CG6767', 'sip2', 'Det', 'Cks30A', 'CycB', 'B52']

cell_cycle_genes = ['PCNA', 'dnk', 'RnrS', 'RnrL', 'Claspin', 'Mcm5', 'Caf1-180',
       'RPA2', 'HipHop', 'stg', 'Mcm6', 'dup', 'WRNexo', 'Mcm7', 'dpa',
       'CG10336', 'Mcm3', 'Mcm2', 'RpA-70', 'Chrac-14', 'CG13690', 'RPA3',
       'asf1', 'DNApol-alpha73', 'CycE', 'DNApol-alpha50',
       'Kmn1', 'Lam', 'Nph', 'msd5', 'msd1', 'ctp', 'Set', 'scra',
       'Chrac-16', 'ncd', 'Ote', 'pzg', 'HDAC1', 'nesd', 'tum', 'CG8173',
       'aurB', 'feo', 'pav', 'CG6767', 'sip2', 'Det', 'Cks30A', 'CycB',
       'B52']

# list of genes we're interested in
genes_of_interest = [
    "Argk",
    "Nrt",
    "Ten-a",
    "Ten-m",
    "wb",
    "Act57B",
    "drl",
    "mid",
    "nemy",
    "lms",
    "CG11835",
    "Gyg",
    "ara",
    "tok",
    "kirre",
    "NK7.1",
    "fj",
    "beat-IIIc",
    "CG33993",
    "dpr16",
    "CG15529",
    "CG9593",
    "beat-IIb",
    "robo2",
    "Ama",
    "fz2",
    "elB",
    "noc",
    "nkd",
    "fng",
    "vg",
]

nuisance_genes = sex_genes + cell_cycle_genes
# list of sex genes and cell cycle genes we will be conditioning on
sex_genes = ["lncRNA:roX1", "lncRNA:roX2", "Sxl", "msl-2"]
# cell_cycle_genes = ['PCNA', 'dnk', 'RnrS', 'RnrL', 'Claspin', 'Mcm5', 'Caf1-180',
#        'RPA2', 'HipHop', 'stg', 'Mcm6', 'dup', 'WRNexo', 'Mcm7', 'dpa',
#        'CG10336', 'Mcm3', 'Mcm2', 'RpA-70', 'Chrac-14', 'CG13690', 'RPA3',
#        'asf1', 'DNApol-alpha73', 'CycE', 'DNApol-alpha50', 'Kmn1', 'Lam',
#        'Nph', 'msd5', 'msd1', 'ctp', 'Set', 'scra', 'Chrac-16', 'ncd',
#        'Ote', 'pzg', 'HDAC1', 'nesd', 'tum', 'CG8173', 'aurB', 'feo',
#        'pav', 'CG6767', 'sip2', 'Det', 'Cks30A', 'CycB', 'B52']

cell_cycle_genes = ['PCNA', 'dnk', 'RnrS', 'RnrL', 'Claspin', 'Mcm5', 'Caf1-180',
       'RPA2', 'HipHop', 'stg', 'Mcm6', 'dup', 'WRNexo', 'Mcm7', 'dpa',
       'CG10336', 'Mcm3', 'Mcm2', 'RpA-70', 'Chrac-14', 'CG13690', 'RPA3',
       'asf1', 'DNApol-alpha73', 'CycE', 'DNApol-alpha50',
       'Kmn1', 'Lam', 'Nph', 'msd5', 'msd1', 'ctp', 'Set', 'scra',
       'Chrac-16', 'ncd', 'Ote', 'pzg', 'HDAC1', 'nesd', 'tum', 'CG8173',
       'aurB', 'feo', 'pav', 'CG6767', 'sip2', 'Det', 'Cks30A', 'CycB',
       'B52']

# list of genes we're interested in
genes_of_interest = [
    "Argk",
    "Nrt",
    "Ten-a",
    "Ten-m",
    "wb",
    "Act57B",
    "drl",
    "mid",
    "nemy",
    "lms",
    "CG11835",
    "Gyg",
    "ara",
    "tok",
    "kirre",
    "NK7.1",
    "fj",
    "beat-IIIc",
    "CG33993",
    "dpr16",
    "CG15529",
    "CG9593",
    "beat-IIb",
    "robo2",
    "Ama",
    "fz2",
    "elB",
    "noc",
    "nkd",
    "fng",
    "vg",
]

nuisance_genes = sex_genes + cell_cycle_genes

In [7]:

# setting up anndata for our regular scvi model
adata = anndata.read('data/myoblasts.h5ad')
adata.layers['counts'] = adata.X.copy() # move count data into a layer
sc.pp.normalize_total(adata)
sc.pp.log1p(adata) # log the data for better umap visualization later
adata.raw = adata
scvi.data.setup_anndata(adata, batch_key ='batch', layer='counts')

# setting up anndata for scvi with conditioning on nuisance genes
adata_cond = adata.copy()


# then copy the expression of each nuisance gene into adata.obs where the key
# is the gene name
for g in nuisance_genes:
    exp = adata_cond[:,g].X
    adata_cond.obs[g] = exp.copy()

# finally, remove the nuisance genes from the anndata
gene_subset = [g for g in adata.var_names if g not in nuisance_genes]
adata_cond = adata_cond[:,gene_subset].copy()

# run setup_anndata with our list of nuisance genes as our continuous covariates
scvi.data.setup_anndata(adata_cond,
                        batch_key = 'batch',
                        continuous_covariate_keys=nuisance_genes, 
                        layer='counts')
# setting up anndata for our regular scvi model
adata = anndata.read('data/myoblasts.h5ad')
adata.layers['counts'] = adata.X.copy() # move count data into a layer
sc.pp.normalize_total(adata)
sc.pp.log1p(adata) # log the data for better umap visualization later
adata.raw = adata
scvi.data.setup_anndata(adata, batch_key ='batch', layer='counts')

# setting up anndata for scvi with conditioning on nuisance genes
adata_cond = adata.copy()


# then copy the expression of each nuisance gene into adata.obs where the key
# is the gene name
for g in nuisance_genes:
    exp = adata_cond[:,g].X
    adata_cond.obs[g] = exp.copy()

# finally, remove the nuisance genes from the anndata
gene_subset = [g for g in adata.var_names if g not in nuisance_genes]
adata_cond = adata_cond[:,gene_subset].copy()

# run setup_anndata with our list of nuisance genes as our continuous covariates
scvi.data.setup_anndata(adata_cond,
                        batch_key = 'batch',
                        continuous_covariate_keys=nuisance_genes, 
                        layer='counts')

INFO     Using batches from adata.obs["batch"]                                               
INFO     No label_key inputted, assuming all cells have same label                           
INFO     Using data from adata.layers["counts"]                                              
INFO     Computing library size prior per batch                                              
INFO     Successfully registered anndata object containing 19885 cells, 3387 vars, 4 batches,
         1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra
         continuous covariates.                                                              
INFO     Please do not further modify adata until model is trained.                          
INFO     Using batches from adata.obs["batch"]                                               
INFO     No label_key inputted, assuming all cells have same label                           
INFO     Using data from adata.layers["counts"]                                              
INFO     Computing library size prior per batch                                              
INFO     Successfully registered anndata object containing 19885 cells, 3332 vars, 4 batches,
         1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 55     
         extra continuous covariates.                                                        
INFO     Please do not further modify adata until model is trained.

scVI with no extra covariates¶

Training¶

Here we setup and train scvi only on batch covariates. In this dataset, we have four batches.

In [8]:

scvi_model = scvi.model.SCVI(adata)
scvi_model = scvi.model.SCVI(adata)

In [9]:

scvi_model.train()
scvi_model.train()

GPU available: True, used: True
TPU available: None, using: 0 TPU cores

Epoch 400/400: 100%|██████████| 400/400 [06:22<00:00,  1.04it/s, loss=2.64e+03, v_num=1]

Visualization¶

Here we visualize the latent space of our trained scvi model.

Notice the model integrates our batches rather well. However, we see the latent space splitting along cell sex and cell cycle confounding the representation of genes we are particularly interested in like Nrt, ptc, and mid.

In [10]:

adata.obsm['X_scvi'] = scvi_model.get_latent_representation(adata)
sc.pp.neighbors(adata, use_rep = 'X_scvi')
sc.tl.umap(adata)
adata.obsm['X_scvi'] = scvi_model.get_latent_representation(adata)
sc.pp.neighbors(adata, use_rep = 'X_scvi')
sc.tl.umap(adata)

Here we see scVI do a good job correcting for batch effects.

In [11]:

sc.pl.umap(adata, 
           color='batch', 
           use_raw=False, 
           frameon=False)
sc.pl.umap(adata, 
           color='batch', 
           use_raw=False, 
           frameon=False)

However, if we look at cell sex markers like rox1 and rox2, we see that cell sex is a strong confounder in our data.

In [12]:

sc.pl.umap(adata, color=["lncRNA:roX1", "lncRNA:roX2"], frameon=False)
sc.pl.umap(adata, color=["lncRNA:roX1", "lncRNA:roX2"], frameon=False)

Likewise, if we look at a cell cycle marker like PCNA, we see that cell cycle has a major effect on our latent represenation.

In [13]:

sc.pl.umap(adata, color="PCNA", frameon=False)
sc.pl.umap(adata, color="PCNA", frameon=False)

In [14]:

fig = sc.pl.umap(
    adata,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_no_correct_umap.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_no_correct_umap.pdf", bbox_inches="tight", dpi=DPI)

The effects of cell cycle and cell sex ends up affecting the represenation of genes we are interested in like Nrt, ptc, and mid. We want to visualize these genes independant of cell sex and cell cycle, which will motivate our next section.

In [15]:

sc.pl.umap(adata, color=["Nrt", "ptc", "mid"], frameon=False)
sc.pl.umap(adata, color=["Nrt", "ptc", "mid"], frameon=False)

In [16]:

sc.pl.umap(
    adata,
    color=["batch", "lncRNA:roX1", "lncRNA:roX2", "PCNA", "Nrt", "ptc", "mid", "fng"],
    frameon=False,
)
sc.pl.umap(
    adata,
    color=["batch", "lncRNA:roX1", "lncRNA:roX2", "PCNA", "Nrt", "ptc", "mid", "fng"],
    frameon=False,
)

scVI with extra covariates¶

In this second part of the tutorial, we train scVI on extra categorical covariates.

In our case, we would like to condition our latent represenation on cell cycle and cell sex gene expression in order to learn a latent representation that is independant of cell sex and cell cycle.

Training¶

In [17]:

cond_model = scvi.model.SCVI(adata_cond)
cond_model.train()
cond_model = scvi.model.SCVI(adata_cond)
cond_model.train()

GPU available: True, used: True
TPU available: None, using: 0 TPU cores

Epoch 400/400: 100%|██████████| 400/400 [06:39<00:00,  1.00it/s, loss=2.54e+03, v_num=1]

Visualization¶

Here we visualize the latent space of our scVI model trained with expression from our nuisance genes as a continuous covariate.

In [18]:

adata_cond.obsm['X_scvi'] = cond_model.get_latent_representation()
sc.pp.neighbors(adata_cond, use_rep = 'X_scvi')
sc.tl.umap(adata_cond)
adata_cond.obsm['X_scvi'] = cond_model.get_latent_representation()
sc.pp.neighbors(adata_cond, use_rep = 'X_scvi')
sc.tl.umap(adata_cond)

Again, we see scVI do a good job correcting for batch effects.

In [19]:

sc.pl.umap(adata_cond, color="batch", use_raw=True, frameon=False)
sc.pl.umap(adata_cond, color="batch", use_raw=True, frameon=False)

Previously we saw strong signal from cell sex markers, rox1 and rox2, in our latent space. Now that we condition on the cell sex genes, that signal has been removed.

In [20]:

sc.pl.umap(adata_cond, color=["lncRNA:roX1", "lncRNA:roX2"], frameon=False)
sc.pl.umap(adata_cond, color=["lncRNA:roX1", "lncRNA:roX2"], frameon=False)

Likewise, if we look at a cell cycle marker PCNA, we now no longer see the effects of cell cycle in our latent space.

In [21]:

sc.pl.umap(adata_cond, color="PCNA", frameon=False)
sc.pl.umap(adata_cond, color="PCNA", frameon=False)

Finally, we look at the represenation of genes we are interested in like Nrt, ptc, and mid. With the signal from cell sex and cell cycle removed from the latent space, the representation of genes we are interested in looks much better!

In [22]:

sc.pl.umap(
    adata_cond, color=["Nrt", "ptc", "mid", "Ama", "vg", "ct", "fng"], frameon=False
)
sc.pl.umap(
    adata_cond, color=["Nrt", "ptc", "mid", "Ama", "vg", "ct", "fng"], frameon=False
)

In [23]:

sc.pl.umap(
    adata_cond,
    color=["batch", "lncRNA:roX1", "lncRNA:roX2", "PCNA", "Nrt", "ptc", "mid", "fng"],
    frameon=False,
)
sc.pl.umap(
    adata_cond,
    color=["batch", "lncRNA:roX1", "lncRNA:roX2", "PCNA", "Nrt", "ptc", "mid", "fng"],
    frameon=False,
)

In [24]:

fig = sc.pl.umap(
    adata_cond,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_with_correct_umap.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata_cond,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_with_correct_umap.pdf", bbox_inches="tight", dpi=DPI)

Scanpy¶

But first we run scanpy's regress_out() as a baseline.

In [25]:

# takes ~ 15 min to run 
adata_regress_out = sc.pp.regress_out(adata_cond, nuisance_genes, copy=True)
adata_regress_out_harmony = adata_regress_out.copy()
# takes ~ 15 min to run 
adata_regress_out = sc.pp.regress_out(adata_cond, nuisance_genes, copy=True)
adata_regress_out_harmony = adata_regress_out.copy()

bbknn¶

In [26]:

sc.pp.scale(adata_regress_out)
sc.tl.pca(adata_regress_out)
sc.external.pp.bbknn(adata_regress_out)
sc.tl.umap(adata_regress_out)
sc.pp.scale(adata_regress_out)
sc.tl.pca(adata_regress_out)
sc.external.pp.bbknn(adata_regress_out)
sc.tl.umap(adata_regress_out)

If we visualize the output of regress out, it doesn't perform any batch correction.

In [27]:

sc.pl.umap(adata_regress_out, 
           color='batch', 
           use_raw=False, 
           frameon=False)
sc.pl.umap(adata_regress_out, 
           color='batch', 
           use_raw=False, 
           frameon=False)

Within a batch, regress out corrects for cell cycle and cell sex expression.

In [28]:

sc.pl.umap(
    adata_regress_out,
    color=[
        "lncRNA:roX1",
        "lncRNA:roX2",
        "PCNA",
        "vg",
    ],
    frameon=False,
)
sc.pl.umap(
    adata_regress_out,
    color=[
        "lncRNA:roX1",
        "lncRNA:roX2",
        "PCNA",
        "vg",
    ],
    frameon=False,
)

However, we can see that it doesn't do a great job for genes we're interested in.

In [29]:

sc.pl.umap(
    adata_regress_out, color=["Nrt", "ptc", "mid", "Ama", "ct", "fng"], frameon=False
)
sc.pl.umap(
    adata_regress_out, color=["Nrt", "ptc", "mid", "Ama", "ct", "fng"], frameon=False
)

In [30]:

fig = sc.pl.umap(
    adata_regress_out,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata_regress_out,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap.pdf", bbox_inches="tight", dpi=DPI)

harmony¶

In [31]:

sc.pp.scale(adata_regress_out_harmony)
sc.tl.pca(adata_regress_out_harmony)
sc.external.pp.harmony_integrate(adata_regress_out_harmony, key="batch")
sc.pp.neighbors(adata_regress_out_harmony, use_rep="X_pca_harmony")
sc.tl.umap(adata_regress_out_harmony)
sc.pp.scale(adata_regress_out_harmony)
sc.tl.pca(adata_regress_out_harmony)
sc.external.pp.harmony_integrate(adata_regress_out_harmony, key="batch")
sc.pp.neighbors(adata_regress_out_harmony, use_rep="X_pca_harmony")
sc.tl.umap(adata_regress_out_harmony)

2021-10-13 10:37:47,350 - harmonypy - INFO - Iteration 1 of 10
2021-10-13 10:37:50,444 - harmonypy - INFO - Iteration 2 of 10
2021-10-13 10:37:53,653 - harmonypy - INFO - Converged after 2 iterations

In [32]:

fig = sc.pl.umap(
    adata_regress_out_harmony,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_harmony.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata_regress_out_harmony,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_harmony.pdf", bbox_inches="tight", dpi=DPI)

Seurat v3¶

In [33]:

adata_regress_out_seurat = adata_regress_out_harmony.copy()
df = pd.read_csv("data/myoblast_regressout_seurat_pca.csv", index_col=0)
adata_regress_out_seurat.obsm["X_pca_seurat"] = df.values
sc.pp.neighbors(adata_regress_out_seurat, use_rep="X_pca_seurat")
sc.tl.umap(adata_regress_out_seurat)
adata_regress_out_seurat = adata_regress_out_harmony.copy()
df = pd.read_csv("data/myoblast_regressout_seurat_pca.csv", index_col=0)
adata_regress_out_seurat.obsm["X_pca_seurat"] = df.values
sc.pp.neighbors(adata_regress_out_seurat, use_rep="X_pca_seurat")
sc.tl.umap(adata_regress_out_seurat)

In [34]:

fig = sc.pl.umap(
    adata_regress_out_seurat,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_seurat.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata_regress_out_seurat,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_seurat.pdf", bbox_inches="tight", dpi=DPI)

Nothing¶

In [35]:

adata_nothing = adata.copy()
sc.pp.scale(adata_nothing)
sc.tl.pca(adata_nothing)
sc.pp.neighbors(adata_nothing)
sc.tl.umap(adata_nothing)
adata_nothing = adata.copy()
sc.pp.scale(adata_nothing)
sc.tl.pca(adata_nothing)
sc.pp.neighbors(adata_nothing)
sc.tl.umap(adata_nothing)

In [36]:

fig = sc.pl.umap(
    adata_nothing,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_nothing.pdf", bbox_inches="tight", dpi=DPI)
fig = sc.pl.umap(
    adata_nothing,
    color=["batch", "PCNA", "lncRNA:roX1", "vg"],
    frameon=False,
    wspace=0.25,
    return_fig=True,
)
fig.savefig("figs/dros_regress_umap_nothing.pdf", bbox_inches="tight", dpi=DPI)

Geary's C Analysis¶

Here we calculate Geary's C for our nuisance genes and genes of interest of our two models.

Calculating Geary's C

In [37]:

genes = nuisance_genes + genes_of_interest
# calculate Geary's C for regular scvi model
vanilla_gc = gearys_c(adata, vals=adata.raw[:, genes].X.T)

# calculate Geary's C for scvi + conditioning
cond_gc = gearys_c(adata_cond, vals=adata_cond.raw[:, genes].X.T)

# calculate Geary's C for regress_out bbknn
regress_out_gc = gearys_c(adata_regress_out, vals=adata_regress_out.raw[:, genes].X.T)

# calculate Geary's C for regress_out harmony
regress_out_gc_harmony = gearys_c(
    adata_regress_out_harmony, vals=adata_regress_out.raw[:, genes].X.T
)

# calculate Geary's C for regress_out seurat
regress_out_gc_seurat = gearys_c(
    adata_regress_out_seurat, vals=adata_regress_out.raw[:, genes].X.T
)

# calculate Geary's C for no correction
regress_out_gc_nothing = gearys_c(
    adata_nothing, vals=adata_regress_out.raw[:, genes].X.T
)
genes = nuisance_genes + genes_of_interest
# calculate Geary's C for regular scvi model
vanilla_gc = gearys_c(adata, vals=adata.raw[:, genes].X.T)

# calculate Geary's C for scvi + conditioning
cond_gc = gearys_c(adata_cond, vals=adata_cond.raw[:, genes].X.T)

# calculate Geary's C for regress_out bbknn
regress_out_gc = gearys_c(adata_regress_out, vals=adata_regress_out.raw[:, genes].X.T)

# calculate Geary's C for regress_out harmony
regress_out_gc_harmony = gearys_c(
    adata_regress_out_harmony, vals=adata_regress_out.raw[:, genes].X.T
)

# calculate Geary's C for regress_out seurat
regress_out_gc_seurat = gearys_c(
    adata_regress_out_seurat, vals=adata_regress_out.raw[:, genes].X.T
)

# calculate Geary's C for no correction
regress_out_gc_nothing = gearys_c(
    adata_nothing, vals=adata_regress_out.raw[:, genes].X.T
)

In [38]:

gc = pd.DataFrame(
    data=np.concatenate(
        (
            vanilla_gc,
            cond_gc,
            regress_out_gc,
            regress_out_gc_harmony,
            regress_out_gc_seurat,
            regress_out_gc_nothing,
        )
    ),
    columns=["Geary's C"],
    index=genes * 6,
)
gc["model"] = (
    ["scVI"] * len(genes)
    + ["scVI-cc"] * len(genes)
    + ["Regress-bbknn"] * len(genes)
    + ["Regress-harmony"] * len(genes)
    + ["Regress-seurat"] * len(genes)
    + ["No correction"] * len(genes)
)
is_nuisance = [True] * len(nuisance_genes) + [False] * len(genes_of_interest)
gc["is_nuisance"] = is_nuisance * 6
gc = pd.DataFrame(
    data=np.concatenate(
        (
            vanilla_gc,
            cond_gc,
            regress_out_gc,
            regress_out_gc_harmony,
            regress_out_gc_seurat,
            regress_out_gc_nothing,
        )
    ),
    columns=["Geary's C"],
    index=genes * 6,
)
gc["model"] = (
    ["scVI"] * len(genes)
    + ["scVI-cc"] * len(genes)
    + ["Regress-bbknn"] * len(genes)
    + ["Regress-harmony"] * len(genes)
    + ["Regress-seurat"] * len(genes)
    + ["No correction"] * len(genes)
)
is_nuisance = [True] * len(nuisance_genes) + [False] * len(genes_of_interest)
gc["is_nuisance"] = is_nuisance * 6

Now we can compare the Geary's C of genes between our models. A Geary's C close to 1 means the expression is uncorrelated with spatial position while values closer to 0 means the expression is positively correlated with spatial position.

For genes that we condition on, we would expect to see Geary's C values closer to 1. This means that genes expression is uncorrelated with the spatial position in the latent space.

In [39]:

non_nuisance_gc =  gc[gc['is_nuisance']==False]
nuisance_gc = gc[gc['is_nuisance']==True]
non_nuisance_gc =  gc[gc['is_nuisance']==False]
nuisance_gc = gc[gc['is_nuisance']==True]

In the below plot, we see that conditioning on cell cycle and cell sex genes increases the Geary's C of those genes compared to not conditioning.

In [55]:

from statannotations.Annotator import Annotator

order = ["scVI", "scVI-cc", "Regress-bbknn", "Regress-harmony", "Regress-seurat"]

fig, ax = plt.subplots()
sns.swarmplot(
    y="Geary's C",
    x="model",
    data=nuisance_gc,
    order=order,
    palette="flare",
    ax=ax,
)
sns.boxplot(
    x="model",
    y="Geary's C",
    data=nuisance_gc,
    order=order, 
    boxprops={"facecolor": "None"},
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Geary's C (higher better)")
ax.set_title("Cell cycle and cell sex genes")

pairs=[("scVI-cc", "Regress-bbknn"), ("scVI-cc", "Regress-harmony"), ("scVI-cc", "Regress-seurat")]

annotator = Annotator(ax, pairs, data=nuisance_gc, x="model", y="Geary's C", order=order)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='inside')
annotator.apply_and_annotate()

fig.savefig("figs/boxplot_geary1.pdf", bbox_inches="tight")
from statannotations.Annotator import Annotator

order = ["scVI", "scVI-cc", "Regress-bbknn", "Regress-harmony", "Regress-seurat"]

fig, ax = plt.subplots()
sns.swarmplot(
    y="Geary's C",
    x="model",
    data=nuisance_gc,
    order=order,
    palette="flare",
    ax=ax,
)
sns.boxplot(
    x="model",
    y="Geary's C",
    data=nuisance_gc,
    order=order, 
    boxprops={"facecolor": "None"},
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_ylabel("Geary's C (higher better)")
ax.set_title("Cell cycle and cell sex genes")

pairs=[("scVI-cc", "Regress-bbknn"), ("scVI-cc", "Regress-harmony"), ("scVI-cc", "Regress-seurat")]

annotator = Annotator(ax, pairs, data=nuisance_gc, x="model", y="Geary's C", order=order)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='inside')
annotator.apply_and_annotate()

fig.savefig("figs/boxplot_geary1.pdf", bbox_inches="tight")

/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 5.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 41.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 49.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 29.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 34.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

scVI-cc vs. Regress-bbknn: Mann-Whitney-Wilcoxon test two-sided, P_val:1.323e-06 U_stat=7.030e+02
scVI-cc vs. Regress-harmony: Mann-Whitney-Wilcoxon test two-sided, P_val:3.362e-11 U_stat=2.622e+03
scVI-cc vs. Regress-seurat: Mann-Whitney-Wilcoxon test two-sided, P_val:1.173e-01 U_stat=1.775e+03

We can also run a Wilcoxon rank-sum test to show that the difference in Geary's C between the models is statistically significant.

In [41]:

scvi_gc = nuisance_gc[nuisance_gc['model'] == 'scVI']["Geary's C"]
covariate_gc = nuisance_gc[nuisance_gc['model'] == 'scVI-cc']["Geary's C"]
regress_gc = nuisance_gc[nuisance_gc['model'] == 'Scanpy']["Geary's C"]

print(ranksums(scvi_gc,covariate_gc))
print(ranksums(scvi_gc, regress_gc))
scvi_gc = nuisance_gc[nuisance_gc['model'] == 'scVI']["Geary's C"]
covariate_gc = nuisance_gc[nuisance_gc['model'] == 'scVI-cc']["Geary's C"]
regress_gc = nuisance_gc[nuisance_gc['model'] == 'Scanpy']["Geary's C"]

print(ranksums(scvi_gc,covariate_gc))
print(ranksums(scvi_gc, regress_gc))

RanksumsResult(statistic=-8.671299312233483, pvalue=4.272170649685279e-18)
RanksumsResult(statistic=nan, pvalue=nan)

/home/adam/.pyenv/versions/scvi-tools-paper/lib/python3.8/site-packages/scipy/stats/stats.py:7246: RuntimeWarning: invalid value encountered in double_scalars

If we look at select genes we didn't condition on, we see that the Geary's C doesn't really change between the two models.

In [56]:

fig, ax = plt.subplots()

order = ["scVI", "scVI-cc", "Regress-bbknn", "Regress-harmony", "Regress-seurat"]

sns.swarmplot(
    y="Geary's C",
    x="model",
    data=non_nuisance_gc,
    order=order,
    palette="flare",
    ax=ax
)
sns.boxplot(
    x="model",
    y="Geary's C",
    data=non_nuisance_gc,
    order=order,
#     showcaps=False,
    boxprops={"facecolor": "None"},
#     showfliers=False,
    ax=ax
)

annotator = Annotator(ax, pairs, data=non_nuisance_gc, x="model", y="Geary's C", order=order)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='inside')
annotator.apply_and_annotate()


ax.set_ylabel("Geary's C (lower better)")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title("Marker genes")
fig.savefig("figs/boxplot_geary2.pdf", bbox_inches="tight")
fig, ax = plt.subplots()

order = ["scVI", "scVI-cc", "Regress-bbknn", "Regress-harmony", "Regress-seurat"]

sns.swarmplot(
    y="Geary's C",
    x="model",
    data=non_nuisance_gc,
    order=order,
    palette="flare",
    ax=ax
)
sns.boxplot(
    x="model",
    y="Geary's C",
    data=non_nuisance_gc,
    order=order,
#     showcaps=False,
    boxprops={"facecolor": "None"},
#     showfliers=False,
    ax=ax
)

annotator = Annotator(ax, pairs, data=non_nuisance_gc, x="model", y="Geary's C", order=order)
annotator.configure(test='Mann-Whitney', text_format='simple', loc='inside')
annotator.apply_and_annotate()


ax.set_ylabel("Geary's C (lower better)")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title("Marker genes")
fig.savefig("figs/boxplot_geary2.pdf", bbox_inches="tight")

scVI-cc vs. Regress-bbknn: Mann-Whitney-Wilcoxon test two-sided, P_val:4.714e-02 U_stat=3.390e+02
scVI-cc vs. Regress-harmony: Mann-Whitney-Wilcoxon test two-sided, P_val:2.783e-01 U_stat=5.580e+02
scVI-cc vs. Regress-seurat: Mann-Whitney-Wilcoxon test two-sided, P_val:9.585e-03 U_stat=2.960e+02

We can also run a Wilcoxon rank-sum test to show that the difference in Geary's C between the models is not statistically significant.

In [43]:

scvi_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'scVI']["Geary's C"]
covariate_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'scVI-cc']["Geary's C"]
regress_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'Regress-bbknn']["Geary's C"]

print(ranksums(scvi_gc,covariate_gc))
print(ranksums(scvi_gc, regress_gc))
scvi_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'scVI']["Geary's C"]
covariate_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'scVI-cc']["Geary's C"]
regress_gc = non_nuisance_gc[non_nuisance_gc['model'] == 'Regress-bbknn']["Geary's C"]

print(ranksums(scvi_gc,covariate_gc))
print(ranksums(scvi_gc, regress_gc))

RanksumsResult(statistic=-0.4997893615082406, pvalue=0.6172234022803302)
RanksumsResult(statistic=-2.231453909832567, pvalue=0.02565107727959247)

In [ ]: