Supplement CellAssign Runtime
Runtime¶
In [1]:
Copied!
import sys
#if True, will install via pypi, else will install from source
stable = True
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
!pip install --quiet scvi-tools[tutorials]==0.9.1
import sys
#if True, will install via pypi, else will install from source
stable = True
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
!pip install --quiet scvi-tools[tutorials]==0.9.1
In [2]:
Copied!
if IN_COLAB:
# import gdown
# url = 'https://drive.google.com/uc?id=1tJSOI9ve0i78WmszMLx2ul8F8tGycBTd'
# output = 'FL_celltype.csv'
# gdown.download(url, output, quiet=False)
!wget https://ndownloader.figshare.com/files/27458831 -O FL_celltype.csv
if IN_COLAB:
# import gdown
# url = 'https://drive.google.com/uc?id=1tJSOI9ve0i78WmszMLx2ul8F8tGycBTd'
# output = 'FL_celltype.csv'
# gdown.download(url, output, quiet=False)
!wget https://ndownloader.figshare.com/files/27458831 -O FL_celltype.csv
In [32]:
Copied!
import scvi
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
import scvi
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
In [33]:
Copied!
# Restore matplotlib's original rc settings before layering the overrides below.
sns.reset_orig()

# NOTE(review): private scanpy setting; presumably rasterises dense layers in
# vector exports -- confirm against the installed scanpy version.
sc.settings._vector_friendly = True

# Figure styling, applied in a single update instead of key-by-key assignments.
plt.rcParams.update(
    {
        # Export: keep text as text in SVG, embed TrueType (type 42) fonts in PDF.
        "svg.fonttype": "none",
        "pdf.fonttype": 42,
        "savefig.transparent": True,
        "figure.figsize": (4, 4),
        # Axes
        "axes.titlesize": 15,
        "axes.titleweight": 500,
        "axes.titlepad": 8.0,
        "axes.labelsize": 14,
        "axes.labelweight": 500,
        "axes.linewidth": 1.2,
        "axes.labelpad": 6.0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        # Fonts (sans-serif fallback order)
        "font.size": 11,
        "font.sans-serif": ["Helvetica", "Computer Modern Sans Serif", "DejaVU Sans"],
        "font.weight": 500,
        # Ticks
        "xtick.labelsize": 12,
        "xtick.minor.size": 1.375,
        "xtick.major.size": 2.75,
        "xtick.major.pad": 2,
        "xtick.minor.pad": 2,
        "ytick.labelsize": 12,
        "ytick.minor.size": 1.375,
        "ytick.major.size": 2.75,
        "ytick.major.pad": 2,
        "ytick.minor.pad": 2,
        # Legend
        "legend.fontsize": 12,
        "legend.handlelength": 1.4,
        "legend.numpoints": 1,
        "legend.scatterpoints": 3,
        "legend.frameon": False,
        # Lines
        "lines.linewidth": 1.7,
    }
)

# Resolution constant for saved figures.
DPI = 300
In [4]:
Copied!
# Load the 10x "fresh_68k_pbmc_donor_a" dataset through scvi-tools
# (the loader downloads and extracts the archive when needed).
adata = scvi.data.dataset_10x("fresh_68k_pbmc_donor_a")

# Make observation and variable labels unique so label-based indexing is
# unambiguous; the two calls are independent of each other.
adata.obs_names_make_unique()
adata.var_names_make_unique()
INFO Downloading file at data/10X/fresh_68k_pbmc_donor_a/filtered_gene_bc_matrices.tar.gz Downloading...: 100%|██████████| 124443/124443.0 [00:02<00:00, 45794.69it/s] INFO Extracting tar file
In [5]:
Copied!
# Marker-gene matrix for CellAssign; the first CSV column becomes the row index
# (it is used further down to select genes from `adata`).
marker_gene_mat = pd.read_csv(
    "data/FL_celltype.csv",
    index_col=0,
)
In [6]:
Copied!
# Remove the TRAC row from the marker matrix.
# errors="ignore" keeps this cell idempotent: re-running it after TRAC is already
# gone would otherwise raise KeyError.
# NOTE(review): the notebook does not state why TRAC is excluded -- confirm.
marker_gene_mat = marker_gene_mat.drop(index="TRAC", errors="ignore")
In [41]:
Copied!
# Sanity check on the marker matrix size after dropping TRAC: the recorded output
# is (23, 5), i.e. 23 genes as rows (columns are presumably cell types -- confirm).
marker_gene_mat.shape
marker_gene_mat.shape
Out[41]:
(23, 5)
In [7]:
Copied!
# Keep only the marker genes, in the row order of the marker matrix ...
bdata = adata[:, marker_gene_mat.index]
# ... and turn the resulting view into an independent AnnData object.
bdata = bdata.copy()

# Register the subset with scvi-tools.
scvi.data.setup_anndata(bdata)
INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 68579 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
In [10]:
Copied!
# Benchmark scvi-tools CellAssign on successively halved prefixes of the data.
num_samples = []  # number of cells used in each run
times = []        # elapsed seconds for model construction + training

n = bdata.n_obs
while n > 1000:  # stop once a subset would have 1000 cells or fewer
    # First n cells, copied so every run registers its own AnnData object.
    cdata = bdata[:n].copy()
    scvi.data.setup_anndata(cdata)

    # perf_counter is monotonic and high-resolution, so the measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    # NOTE(review): "n_counts" is not created in this notebook; presumably the
    # data loader adds it to .obs -- confirm.
    model = scvi.external.CellAssign(cdata, marker_gene_mat, "n_counts")
    model.train()
    elapsed = time.perf_counter() - start

    num_samples.append(cdata.n_obs)
    times.append(elapsed)
    print(elapsed)

    n //= 2
INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 68579 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 340/400: 85%|████████▌ | 340/400 [01:54<00:20, 2.97it/s, loss=72.4, v_num=1] 114.59993028640747 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 34289 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 282/400: 70%|███████ | 282/400 [00:48<00:20, 5.84it/s, loss=77.4, v_num=1] 48.27886652946472 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 17144 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 400/400: 100%|██████████| 400/400 [00:36<00:00, 11.03it/s, loss=70.4, v_num=1] 36.27116823196411 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 8572 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 400/400: 100%|██████████| 400/400 [00:19<00:00, 20.88it/s, loss=82.5, v_num=1] 19.164595127105713 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 4286 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 400/400: 100%|██████████| 400/400 [00:10<00:00, 36.63it/s, loss=81.9, v_num=1] 10.929171800613403 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 2143 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 400/400: 100%|██████████| 400/400 [00:06<00:00, 58.57it/s, loss=87.4, v_num=1] 6.837259292602539 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.X INFO Computing library size prior per batch WARNING This dataset has some empty cells, this might fail inference.Data should be filtered with `scanpy.pp.filter_cells()` INFO Successfully registered anndata object containing 1071 cells, 23 vars, 1 batches, 1 labels, and 0 proteins. Also registered 0 extra categorical covariates and 0 extra continuous covariates. INFO Please do not further modify adata until model is trained.
GPU available: True, used: True TPU available: None, using: 0 TPU cores
Epoch 400/400: 100%|██████████| 400/400 [00:04<00:00, 82.95it/s, loss=79.9, v_num=1] 4.830204725265503
In [26]:
Copied!
# Timings of the scvi-tools runs, tagged with the implementation name for plotting.
df = pd.DataFrame({"n_obs": num_samples, "time": times}).assign(
    Implementation="scvi-tools"
)
In [45]:
Copied!
# Timings written by the R benchmark script, tagged as the original implementation.
r_time_df = pd.read_csv("data/cell_assign_r_runtime.csv", index_col=0).assign(
    Implementation="Original"
)
r_time_df
Out[45]:
n_obs | time | Implementation | |
---|---|---|---|
1 | 68579 | 9678.959298 | Original |
2 | 34289 | 5046.696466 | Original |
3 | 17144 | 2501.130012 | Original |
4 | 8572 | 1000.611695 | Original |
5 | 4286 | 577.809937 | Original |
6 | 2143 | 217.614224 | Original |
7 | 1071 | 96.258606 | Original |
In [46]:
Copied!
# Stack both timing tables row-wise. ignore_index=True gives the result a fresh
# 0..N-1 index; otherwise the two inputs contribute overlapping row labels
# (0..6 and 1..7), which makes label-based access on total_df ambiguous.
total_df = pd.concat([df, r_time_df], axis=0, ignore_index=True)
In [49]:
Copied!
import os

# savefig does not create missing directories, so make sure figs/ exists.
os.makedirs("figs", exist_ok=True)

fig, ax = plt.subplots(1, 1)
sns.scatterplot(data=total_df, x="n_obs", y="time", hue="Implementation", ax=ax)
# The two implementations differ by orders of magnitude in runtime, hence the
# logarithmic y-axis.
ax.set(xlabel="Number of cells", ylabel="Time (s)", yscale="log")
# Use the notebook-wide DPI constant instead of repeating the literal.
fig.savefig("figs/cellassign_runtime.pdf", dpi=DPI, bbox_inches="tight")
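R code

The R script below times the original `cellassign` package on the same halving schedule of cell counts and writes the results to `data/cell_assign_r_runtime.csv`, which the notebook loads for the comparison plot.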
# Benchmark the original R cellassign implementation on successively halved
# prefixes of the PBMC data and save the timings as CSV.
library(SingleCellExperiment)
library(cellassign)
library(readr)

pbmc <- readRDS("data/pbmc.rds")
# Expose the "X" assay through the counts() accessor used below.
counts(pbmc) <- assay(pbmc, "X")

marker_mat <- read_csv("data/FL_celltype.csv")
# Drop TRAC by gene name instead of by row position (the script previously removed
# row 24), so the result no longer depends on the row order of the CSV.
# NOTE(review): assumes row 24 was the TRAC row -- confirm against the CSV.
marker_mat <- marker_mat[marker_mat$Gene != "TRAC", ]

# Gene names label the marker matrix and select rows of `pbmc`. Stored under a
# name that does not shadow base::rownames().
marker_genes <- marker_mat[["Gene"]]

# Numeric marker matrix without the gene-name column (removing it before the
# conversion avoids coercing a character column to numbers).
mm <- data.matrix(marker_mat[, -1])
rownames(mm) <- marker_genes

# Per-cell total counts, passed to cellassign as size factors.
s <- colSums(counts(pbmc))

n <- 68579
# Guard against indexing past the available cells (s[1:n] would yield NA).
stopifnot(n <= ncol(pbmc))

times <- c()
n_obs <- c()
i <- 1
while (n > 1000) {
  cells <- seq_len(n)
  start <- Sys.time()
  fit <- cellassign(exprs_obj = pbmc[marker_genes, cells],
                    marker_gene_info = mm,
                    s = s[cells],
                    learning_rate = 1e-2,
                    shrinkage = TRUE,
                    verbose = TRUE)
  # Store plain seconds rather than relying on implicit difftime coercion.
  times[i] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  n_obs[i] <- n
  i <- i + 1
  n <- n %/% 2
}

df <- data.frame("n_obs" = n_obs, "time" = times)
write.csv(df, "data/cell_assign_r_runtime.csv")
In [ ]:
Copied!