Uncertainty analysis - Pancreas#
Library imports#
import os
import sys
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import mplscience
import seaborn as sns
import scanpy as sc
import scvelo as scv
import scvi
from scvelo.plotting.simulation import compute_dynamics
from velovi import preprocess_data, VELOVI
from velovi._model import _compute_directional_statistics_tensor
sys.path.append("../..")
from paths import DATA_DIR, FIG_DIR
Global seed set to 0
General settings#
scvi.settings.dl_pin_memory_gpu_training = False
sns.reset_defaults()
sns.reset_orig()
scv.settings.set_figure_params('scvelo', dpi_save=400, dpi=80, transparent=True, fontsize=20, color_map='viridis')
SAVE_FIGURES = True
if SAVE_FIGURES:
os.makedirs(FIG_DIR / 'uncertainty' / 'pancreas', exist_ok=True)
Function definition#
def fit_velovi(bdata):
VELOVI.setup_anndata(bdata, spliced_layer="Ms", unspliced_layer="Mu")
vae = VELOVI(bdata)
vae.train()
df = vae.history["elbo_train"].iloc[20:].reset_index().rename(columns={'elbo_train': 'elbo'})
df['set'] = 'train'
_df = vae.history["elbo_validation"].iloc[20:].reset_index().rename(columns={'elbo_validation': 'elbo'})
_df['set'] = 'validation'
df = pd.concat([df, _df], axis=0).reset_index(drop=True)
with mplscience.style_context():
sns.set_style(style="whitegrid")
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=df, x='epoch', y='elbo', hue='set', palette=['#0173B2', '#DE8F05'], ax=ax)
latent_time = vae.get_latent_time(n_samples=25)
velocities = vae.get_velocity(n_samples=25, velo_statistic="mean")
t = latent_time
scaling = 20 / t.max(0)
bdata.layers["velocities_velovi"] = velocities / scaling
bdata.layers["latent_time_velovi"] = latent_time
bdata.var["fit_alpha"] = vae.get_rates()["alpha"] / scaling
bdata.var["fit_beta"] = vae.get_rates()["beta"] / scaling
bdata.var["fit_gamma"] = vae.get_rates()["gamma"] / scaling
bdata.var["fit_t_"] = (
torch.nn.functional.softplus(vae.module.switch_time_unconstr)
.detach()
.cpu()
.numpy()
) * scaling
bdata.layers["fit_t"] = latent_time.values * scaling[np.newaxis, :]
bdata.var['fit_scaling'] = 1.0
return vae
def compute_sign_variance(adata, vae):
v_stack = vae.get_velocity(n_samples=50, velo_statistic="mean", return_mean=False)
pos_freq = (v_stack >= 0).mean(0)
# neg_freq = (v_stack < 0).mean(0)
adata.layers["velocity"] = v_stack.mean(0)
var_freq = pos_freq * (1 - pos_freq)
adata.obs["sign_var"] = var_freq.mean(1)
adata.layers["sign_var"] = var_freq
adata.layers["variance"] = v_stack.var(0)
def compute_sign_var_score(adata, labels_key, vae):
compute_sign_variance(adata, vae)
sign_var_df = adata.to_df("sign_var")
expr_df = adata.to_df("Ms")
prod_df = sign_var_df * np.abs(expr_df)
prod_df[labels_key] = adata.obs[labels_key]
prod_df = prod_df.groupby(labels_key).mean()
sign_var_df[labels_key] = adata.obs[labels_key]
sign_var_df = sign_var_df.groupby(labels_key).mean()
# max over clusters for a gene
return sign_var_df.mean(0)
def gene_rank(adata, vkey="velocities_velovi"):
from scipy.stats import rankdata
scv.tl.velocity_graph(adata, vkey=vkey)
tm = scv.utils.get_transition_matrix(
adata, vkey=vkey, use_negative_cosines=True, self_transitions=True
)
tm.setdiag(0)
adata.layers["Ms_extrap"] = tm @ adata.layers["Ms"]
adata.layers["Ms_delta"] = adata.layers["Ms_extrap"] - adata.layers["Ms"]
prod = adata.layers["Ms_delta"] * adata.layers[vkey]
ranked = rankdata(prod, axis=1)
adata.layers["product_score"] = prod
adata.layers["ranked_score"] = ranked
def plot_phase_portrait(adata, gene, color, figsize=(6, 6)):
fig, ax = plt.subplots(figsize=figsize)
df = pd.DataFrame(
{
'unspliced': adata[:, gene].layers['Mu'].squeeze().copy(),
'spliced': adata[:, gene].layers['Ms'].squeeze().copy(),
'color': color
}
)
with mplscience.style_context():
sns.scatterplot(data=df, x='spliced', y='unspliced', c=color, s=25, ax=ax);
_, unspliced, spliced = compute_dynamics(adata, basis=gene, extrapolate=True, sort=True)
df = pd.DataFrame(
{
'unspliced': unspliced.squeeze(),
'spliced': spliced.squeeze(),
}
)
ax.plot(spliced, unspliced, color="purple", linewidth=2)
spliced_steady_state = np.linspace(np.min(spliced), np.max(spliced))
unspliced_steady_state = adata.var.loc[gene, 'fit_gamma'] / adata.var.loc[gene, 'fit_beta'] * (spliced_steady_state - np.min(spliced_steady_state)) + np.min(unspliced)
ax.plot(spliced_steady_state, unspliced_steady_state, color='purple', linestyle="--", linewidth=2);
ax.axis('off')
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / f'phase_portrait_{gene}.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
Data loading#
adata = scv.datasets.pancreas(DATA_DIR / "pancreas" / "endocrinogenesis_day15.h5ad")
adata
AnnData object with n_obs × n_vars = 3696 × 27998
obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
var: 'highly_variable_genes'
uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
obsm: 'X_pca', 'X_umap'
layers: 'spliced', 'unspliced'
obsp: 'distances', 'connectivities'
scv.pl.scatter(adata, basis='umap', c='clusters', dpi=200)
Data preprocessing#
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
adata = preprocess_data(adata)
Filtered out 20801 genes that are detected 20 counts (shared).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
Logarithmized X.
computing neighbors
finished (0:00:11) --> added
'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
finished (0:00:00) --> added
'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
finished (0:00:00) --> added
'velocity', velocity vectors for each individual cell (adata.layers)
Model training#
vae = fit_velovi(adata)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 500/500: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [04:32<00:00, 1.83it/s, loss=-2.65e+03, v_num=1]
scv.tl.velocity_graph(adata, vkey="velocities_velovi", sqrt_transform=False)
scv.tl.velocity_embedding(
adata, vkey="velocities_velovi", use_negative_cosines=True, self_transitions=True
)
computing velocity graph (using 1/14 cores)
finished (0:00:08) --> added
'velocities_velovi_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
finished (0:00:00) --> added
'velocities_velovi_umap', embedded velocity vectors (adata.obsm)
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
scv.pl.velocity_embedding_stream(
adata, vkey="velocities_velovi", color=["clusters"], cmap="viridis", legend_loc=False, title='', ax=ax
)
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'velocity_stream.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
Uncertainty#
Intrinsic#
udf, _ = vae.get_directional_uncertainty(n_samples=100)
INFO velovi: Sampling from model...
INFO velovi: Computing the uncertainties...
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 14 concurrent workers.
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
[Parallel(n_jobs=-1)]: Done 4 tasks | elapsed: 3.5s
[Parallel(n_jobs=-1)]: Done 152 tasks | elapsed: 4.0s
[Parallel(n_jobs=-1)]: Done 2158 tasks | elapsed: 6.6s
[Parallel(n_jobs=-1)]: Done 3696 out of 3696 | elapsed: 8.5s finished
for c in udf.columns:
adata.obs[c] = np.log10(udf[c].values)
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
scv.pl.umap(adata, color='directional_cosine_sim_variance', perc=[5, 95], title='', cmap="Greys", ax=ax)
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'directional_cosine_sim_variance_initrinsic.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
Extrinsic#
extrapolated_cells_list = []
for i in range(25):
vkey = "velocities_velovi_{i}".format(i=i)
v = vae.get_velocity(n_samples=1, velo_statistic="mean")
adata.layers[vkey] = v
scv.tl.velocity_graph(adata, vkey=vkey, sqrt_transform=False, approx=True)
t_mat = scv.utils.get_transition_matrix(
adata, vkey=vkey, self_transitions=True, use_negative_cosines=True
)
extrapolated_cells = np.asarray(t_mat @ adata.layers["Ms"])
extrapolated_cells_list.append(extrapolated_cells)
extrapolated_cells = np.stack(extrapolated_cells_list)
computing velocity graph (using 1/14 cores)
finished (0:00:03) --> added
'velocities_velovi_0_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_1_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_2_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_3_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_4_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_5_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_6_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_7_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_8_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_9_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_10_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_11_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_12_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_13_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_14_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_15_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_16_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_17_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_18_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_19_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_20_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_21_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_22_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_23_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity graph (using 1/14 cores)
finished (0:00:02) --> added
'velocities_velovi_24_graph', sparse matrix with cosine correlations (adata.uns)
df, _ = _compute_directional_statistics_tensor(extrapolated_cells, n_jobs=4, n_cells=adata.n_obs)
INFO velovi: Computing the uncertainties...
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
[Parallel(n_jobs=4)]: Done 24 tasks | elapsed: 3.2s
[Parallel(n_jobs=4)]: Done 3480 tasks | elapsed: 5.6s
[Parallel(n_jobs=4)]: Done 3696 out of 3696 | elapsed: 5.7s finished
for c in df.columns:
adata.obs[c + "_extrinsic"] = np.log10(df[c].values)
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
scv.pl.umap(adata, color='directional_cosine_sim_variance_extrinsic', perc=[5, 95], cmap='viridis', ax=ax)
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'directional_cosine_sim_variance_extrinsic.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
Velocity coherence#
sign_score = compute_sign_var_score(adata, 'clusters', vae)
gene_rank(adata)
computing velocity graph (using 1/14 cores)
finished (0:00:08) --> added
'velocities_velovi_graph', sparse matrix with cosine correlations (adata.uns)
alpha_cells = adata.obs.query("clusters == 'Alpha'").index
alpha_cluster_data = adata[alpha_cells]
alpha_cluster_data.obs['mean_product_score_per_cell_alpha'] = alpha_cluster_data.layers['product_score'].mean(axis=1)
alpha_cluster_data.var['mean_product_score_per_gene_alpha'] = alpha_cluster_data.layers['product_score'].mean(axis=0)
ngn3_high_cells = adata.obs.query("clusters == 'Ngn3 high EP'").index
ngn3_high_cluster_data = adata[ngn3_high_cells]
ngn3_high_cluster_data.obs['mean_product_score_per_cell_ngn3_high_ep'] = ngn3_high_cluster_data.layers['product_score'].mean(axis=1)
ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'] = ngn3_high_cluster_data.layers['product_score'].mean(axis=0)
sorted_alpha_data = alpha_cluster_data.var['mean_product_score_per_gene_alpha'].sort_values().values
sorted_ngn3_high_data = ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'].sort_values().values
cdf_alpha = 1. * np.arange(len(sorted_alpha_data)) / (len(sorted_alpha_data) - 1)
cdf_ngn3_high = 1. * np.arange(len(sorted_ngn3_high_data)) / (len(sorted_ngn3_high_data) - 1)
df = pd.concat(
[
pd.DataFrame(
{
'Velocity coherence': sorted_alpha_data,
'CDF': cdf_alpha,
'Cell type': 'Alpha'
}
),
pd.DataFrame(
{
'Velocity coherence': sorted_ngn3_high_data,
'CDF': cdf_ngn3_high,
'Cell type': 'Ngn3 high EP'
}
),
],
axis=0
).reset_index(drop=True)
alpha_95_percentile = np.percentile(alpha_cluster_data.var['mean_product_score_per_gene_alpha'], 95)
sns.set_style("whitegrid")
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=df, x='Velocity coherence', y='CDF', hue='Cell type', ax=ax)
plt.vlines(x=alpha_95_percentile, ymin=0, ymax=1, colors='black', linestyles='dashed')
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'mean_product_score_per_gene_cdf.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
sns.set_style("whitegrid")
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=df, x='Velocity coherence', y='CDF', hue='Cell type', ax=ax)
ax.set_xlim(
[alpha_95_percentile, 0.007]
);
ax.set_ylim([0.875, 1])
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'mean_product_score_per_gene_cdf_zoom.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
df = pd.concat(
[
pd.DataFrame(
{
'Velocity coherence': alpha_cluster_data.var['mean_product_score_per_gene_alpha'],
'Cell type': 'Alpha'
}
),
pd.DataFrame(
{
'Velocity coherence': ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'],
'Cell type': 'Ngn3 high EP'
}
),
],
axis=0
)
df['Dataset'] = 'Pancreas'
sns.set_style("whitegrid")
with mplscience.style_context():
fig, ax = plt.subplots(figsize=(6, 4))
sns.violinplot(data=df, x='Dataset', y='Velocity coherence', hue='Cell type', split=True, palette='colorblind');
ax.set_ylim([-0.003, 0.005]);
if SAVE_FIGURES:
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'mean_product_score_per_gene.svg',
format="svg",
transparent=True,
bbox_inches='tight'
)
fig.savefig(
FIG_DIR / 'uncertainty' / 'pancreas' / 'mean_product_score_per_gene.pdf',
transparent=True,
bbox_inches='tight'
)
/vol/storage/miniconda3/envs/velovi-py39/lib/python3.9/site-packages/seaborn/categorical.py:381: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
if LooseVersion(mpl.__version__) < "3.0":
/vol/storage/miniconda3/envs/velovi-py39/lib/python3.9/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
other = LooseVersion(other)
Alpha cells#
lb, ub = np.percentile(alpha_cluster_data.var['mean_product_score_per_gene_alpha'], [5, 95])
n_genes_above_upper_percentile = (alpha_cluster_data.var['mean_product_score_per_gene_alpha'] > ub).sum()
n_genes_below_lower_percentile = (alpha_cluster_data.var['mean_product_score_per_gene_alpha'] < lb).sum()
print(f'Number of genes above percentile: {n_genes_above_upper_percentile}')
Number of genes above percentile: 54
Ngn3 high EP cells#
n_genes_above_upper_percentile = (ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'] > ub).sum()
n_genes_below_lower_percentile = (ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'] < lb).sum()
print(f'Number of genes above percentile: {n_genes_above_upper_percentile}')
Number of genes above percentile: 135
Phase portraits - Alpha cells#
color = adata.obs['clusters'].astype(str).replace(
dict(zip(adata.obs['clusters'].cat.categories, adata.uns['clusters_colors']))
).tolist()
Highest velocity coherence
for gene in alpha_cluster_data.var['mean_product_score_per_gene_alpha'].sort_values().index[:5]:
plot_phase_portrait(adata, gene, color)
Lowest velocity coherence
for gene in alpha_cluster_data.var['mean_product_score_per_gene_alpha'].sort_values().index[-5:]:
plot_phase_portrait(adata, gene, color)
Phase portraits - Ngn3 high EP cells#
Highest velocity coherence
for gene in ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'].sort_values().index[:5]:
plot_phase_portrait(adata, gene, color)
Lowest velocity coherence
for gene in ngn3_high_cluster_data.var['mean_product_score_per_gene_ngn3_high_ep'].sort_values().index[-5:]:
plot_phase_portrait(adata, gene, color)