Make datasets for runtime analysis¶
We subsample the Heart Cell Atlas data to a range of cell counts, add additional continuous covariates, and select the top 4000 highly variable genes.
In [1]:
Copied!
import scvi
import anndata
import numpy as np
import pandas as pd
import time
import scanpy as sc
import scvi
import anndata
import numpy as np
import pandas as pd
import time
import scanpy as sc
In [2]:
Copied!
# Load the Heart Cell Atlas AnnData file.
# Download the data from https://www.heartcellatlas.org/ and place it at
# ../data/hca_raw.h5ad before running this cell.
# `anndata.read` is a deprecated alias; `read_h5ad` is the supported reader.
adata = anndata.read_h5ad('../data/hca_raw.h5ad')
In [3]:
Copied!
# Stash a copy of the count matrix in a layer before X is overwritten below.
adata.layers["counts"] = adata.X.copy()

# Normalize each cell to a total of 1e4, then log1p-transform X in place
# (log-scale values give a better UMAP visualization later).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the log-normalized data reachable through `adata.raw`.
adata.raw = adata
In [4]:
Copied!
# Continuous nuisance covariates; this list is extended with synthetic
# noise columns in a later cell.
cont_nuisance_cov = [
    "percent_mito",
    "percent_ribo",
]
In [5]:
Copied!
# Make 8 random nuisance covariates: one standard-normal value per cell,
# stored in adata.obs as rand_noise_0 ... rand_noise_7.
# A dedicated seeded generator keeps the noise reproducible across runs
# without touching NumPy's global random state.
noise_rng = np.random.default_rng(0)
for i in range(8):
    key = 'rand_noise_{}'.format(i)
    adata.obs[key] = noise_rng.normal(size=adata.n_obs)
    # Only register the key once so re-running this cell does not add
    # duplicate entries to the covariate list.
    if key not in cont_nuisance_cov:
        cont_nuisance_cov.append(key)
In [6]:
Copied!
# Display the full list of continuous covariates collected so far.
cont_nuisance_cov
Out[6]:
['percent_mito', 'percent_ribo', 'rand_noise_0', 'rand_noise_1', 'rand_noise_2', 'rand_noise_3', 'rand_noise_4', 'rand_noise_5', 'rand_noise_6', 'rand_noise_7']
In [7]:
Copied!
# Save the processed object to disk; the dataset-generation cell below
# reloads it from this file.
processed_path = 'hca_processed.h5ad'
adata.write(processed_path)
In [28]:
Copied!
# Grid of datasets to generate: number of cells x number of continuous
# covariates x number of highly variable genes.
n_obs = [5000, 10000, 20000, 40000, 80000, 160000, 320000, 486134]
n_covs = [10]
n_vars = [4000]

# NOTE(review): `bbknn_covs` was used here but never defined anywhere in the
# notebook (it only worked thanks to leftover kernel state). It is assumed to
# be the same categorical covariates that are registered with scvi below --
# confirm against the original analysis.
bbknn_covs = ["cell_source", "donor"]

# bbknn fails if any batch has fewer than this many cells.
MIN_CELLS_PER_BATCH = 3
# Upper bound on re-draws so an unsatisfiable subsample cannot loop forever.
MAX_RESAMPLE_TRIES = 100
# Seeded generator so the same cells are drawn on every run.
subsample_rng = np.random.default_rng(0)


def _subsample_with_batch(source, n_cells, batch_covs, rng):
    """Draw `n_cells` cells without replacement and label their batches.

    Returns a copy of the selected cells whose ``obs['batch']`` column is the
    string concatenation of the columns named in `batch_covs`.
    """
    idx = rng.choice(source.n_obs, n_cells, replace=False)
    subset = source[idx].copy()
    # Assigning a scalar replaces any pre-existing 'batch' column.
    subset.obs['batch'] = ''
    for cov in batch_covs:
        subset.obs['batch'] = subset.obs['batch'].astype(str) + subset.obs[cov].astype(str)
    return subset


adata = anndata.read_h5ad('hca_processed.h5ad')
# for backed mode
adata.strings_to_categoricals()
del adata.raw

for n_cov in n_covs:
    if n_cov > len(cont_nuisance_cov):
        raise ValueError(
            'Requested {} continuous covariates but only {} are available'.format(
                n_cov, len(cont_nuisance_cov)
            )
        )
    # Use exactly n_cov covariates so the file name matches the contents
    # (with n_covs = [10] this is the full list, as before).
    cov_keys = cont_nuisance_cov[:n_cov]

    for n_ob in n_obs:
        for n_var in n_vars:
            tmp_adata = _subsample_with_batch(adata, n_ob, bbknn_covs, subsample_rng)

            # If one of the batches has fewer than 3 cells, bbknn will fail,
            # so re-draw the subsample until every batch is large enough.
            tries = 1
            while tmp_adata.obs['batch'].value_counts().min() < MIN_CELLS_PER_BATCH:
                # Re-drawing cannot help when all cells are already selected.
                if n_ob >= adata.n_obs or tries >= MAX_RESAMPLE_TRIES:
                    raise RuntimeError(
                        'Could not draw {} cells with at least {} cells in every batch'.format(
                            n_ob, MIN_CELLS_PER_BATCH
                        )
                    )
                tmp_adata = _subsample_with_batch(adata, n_ob, bbknn_covs, subsample_rng)
                tries += 1

            # Keep only the top n_var highly variable genes, computed on the
            # raw counts layer.
            sc.pp.highly_variable_genes(
                tmp_adata,
                n_top_genes=n_var,
                subset=True,
                inplace=True,
                flavor="seurat_v3",
                layer='counts',
                batch_key="cell_source",
            )

            message = '{}, {}, {}, '.format(n_var, n_cov, tmp_adata.n_obs)
            print(message, tmp_adata.n_vars)

            scvi.data.setup_anndata(
                tmp_adata,
                categorical_covariate_keys=["cell_source", "donor"],
                layer="counts",
                continuous_covariate_keys=cov_keys,
            )

            # File name encodes <n covariates>_<n cells>_<n genes>.
            adata_fn = '{}_{}_{}.h5ad'.format(n_cov, n_ob, n_var)
            tmp_adata.write(adata_fn)
4000, 10, 5000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch INFO Successfully registered anndata object containing 5000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs) ... storing 'batch' as categorical
4000, 10, 10000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch INFO Successfully registered anndata object containing 10000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs) ... storing 'batch' as categorical
4000, 10, 20000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"]
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
INFO Computing library size prior per batch INFO Successfully registered anndata object containing 20000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs) ... storing 'batch' as categorical /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
4000, 10, 40000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch INFO Successfully registered anndata object containing 40000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs) ... storing 'batch' as categorical /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
4000, 10, 80000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch INFO Successfully registered anndata object containing 80000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs) ... storing 'batch' as categorical /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
4000, 10, 160000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs)
INFO Successfully registered anndata object containing 160000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
... storing 'batch' as categorical /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
4000, 10, 320000, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs)
INFO Successfully registered anndata object containing 320000 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
... storing 'batch' as categorical /home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/scanpy/preprocessing/_highly_variable_genes.py:144: FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version. Use .loc with labels or .iloc with positions instead. df.loc[: int(n_top_genes), 'highly_variable'] = True
4000, 10, 486134, 4000 INFO No batch_key inputted, assuming all cells are same batch INFO No label_key inputted, assuming all cells have same label INFO Using data from adata.layers["counts"] INFO Computing library size prior per batch
/home/galen/.pyenv/versions/scvi-dev/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2487: FutureWarning: The `inplace` parameter in pandas.Categorical.remove_unused_categories is deprecated and will be removed in a future version. res = method(*args, **kwargs)
INFO Successfully registered anndata object containing 486134 cells, 4000 vars, 1 batches, 1 labels, and 0 proteins. Also registered 2 extra categorical covariates and 10 extra continuous covariates. INFO Please do not further modify adata until model is trained.
... storing 'batch' as categorical
In [ ]:
Copied!