Data#
Get the Data#
Note
We are going to download a set of six PBMC 10x datasets from three COVID-19 patients and three healthy controls. Each sample has been subsampled to 1,500 cells.
#@title check working Directory
# Print the current working directory; the relative paths used in later
# cells (data/raw, Objects/) are resolved against it.
!pwd
/content/drive/MyDrive/scRNA_using_Python
#@title check content of folder
# List the working directory in long format with human-readable sizes.
!ls -lh
total 18K
drwx------ 2 root root 4.0K May 14 16:39 data
-rw------- 1 root root 0 May 28 04:27 new_file_in_working_directory.txt
drwx------ 2 root root 4.0K May 14 16:21 Objects
-rw------- 1 root root 9.2K May 28 04:45 requirementsscRNA.txt
-rw------- 1 root root 1 May 17 20:08 requirements.txt
#@title Download data from URL
%%bash
# Create the data directory (no-op if it already exists).
mkdir -p data/raw

# Count the .h5 files that are already present. stderr is silenced so that an
# empty directory simply yields 0 instead of an "ls: cannot access" message.
count=$(ls data/raw/*.h5 2>/dev/null | wc -l)
echo $count

# If fewer than 6 files are present, fetch all six samples from GitHub.
if (( count < 6 )); then
  base_url=https://raw.githubusercontent.com/NBISweden/workshop-scRNAseq/new_dataset/labs/data/covid_data_GSE149689/sub
  cd data/raw
  for sample in Normal_PBMC_13 Normal_PBMC_14 Normal_PBMC_5 nCoV_PBMC_15 nCoV_PBMC_17 nCoV_PBMC_1; do
    # -f: fail on HTTP errors instead of saving the error page as a .h5 file
    #     (such a file would be counted as "present" on the next run).
    # -L: follow redirects.
    curl -fL -O "${base_url}/${sample}.h5"
  done
  cd ../..
fi

# Show what ended up in the raw data directory.
ls -lGa data/raw
6
total 22176
-rw------- 1 root 3169573 May 14 16:39 nCoV_PBMC_15.h5
-rw------- 1 root 4105636 May 14 16:39 nCoV_PBMC_17.h5
-rw------- 1 root 3426598 May 14 16:39 nCoV_PBMC_1.h5
-rw------- 1 root 4391693 May 14 16:39 Normal_PBMC_13.h5
-rw------- 1 root 3806925 May 14 16:39 Normal_PBMC_14.h5
-rw------- 1 root 3806384 May 14 16:39 Normal_PBMC_5.h5
# Show the directory layout below the working directory as a tree
# (needs the `tree` utility to be available in the runtime).
!tree
.
├── data
│ ├── raw
│ │ ├── nCoV_PBMC_15.h5
│ │ ├── nCoV_PBMC_17.h5
│ │ ├── nCoV_PBMC_1.h5
│ │ ├── Normal_PBMC_13.h5
│ │ ├── Normal_PBMC_14.h5
│ │ └── Normal_PBMC_5.h5
│ └── regev_lab_cell_cycle_genes.txt
├── new_file_in_working_directory.txt
├── Objects
│ ├── adata_raw_covid.h5ad
│ ├── sc_qc_filtered_covid.h5ad
│ ├── sc_QCFN_covid.h5ad
│ ├── sc_QCNFS_covid.h5ad
│ └── sc_QCNFSDM_covid.h5ad
├── requirementsscRNA.txt
└── requirements.txt
3 directories, 15 files
#@title Load data individually
def _read_unique(h5_path):
    """Read one 10x HDF5 file and make its variable (gene) names unique."""
    loaded = sc.read_10x_h5(h5_path)
    # Applied right after reading, before the samples are combined.
    loaded.var_names_make_unique()
    return loaded


# Covid-19 patient samples.
data_cov1 = _read_unique('./data/raw/nCoV_PBMC_1.h5')
data_cov15 = _read_unique('./data/raw/nCoV_PBMC_15.h5')
data_cov17 = _read_unique('./data/raw/nCoV_PBMC_17.h5')

# Healthy control samples.
data_ctrl5 = _read_unique('./data/raw/Normal_PBMC_5.h5')
data_ctrl13 = _read_unique('./data/raw/Normal_PBMC_13.h5')
data_ctrl14 = _read_unique('./data/raw/Normal_PBMC_14.h5')
#@title Create Merged Object
import anndata as ad

# Sample label -> (loaded object, condition). The insertion order defines the
# batch order of the merged object (batch "0" is covid_1, ..., "5" is ctrl_14).
samples = {
    "covid_1": (data_cov1, "Covid"),
    "covid_15": (data_cov15, "Covid"),
    "covid_17": (data_cov17, "Covid"),
    "ctrl_5": (data_ctrl5, "Ctrl"),
    "ctrl_13": (data_ctrl13, "Ctrl"),
    "ctrl_14": (data_ctrl14, "Ctrl"),
}

# add some metadata
for sample_name, (sample_adata, condition) in samples.items():
    sample_adata.obs['type'] = condition
    sample_adata.obs['sample'] = sample_name

# merge into one object.
# `AnnData.concatenate` is deprecated in favour of `anndata.concat`; the
# arguments below reproduce the legacy defaults: inner join on genes, a
# categorical 'batch' column with labels "0".."5", "-<batch>" appended to each
# cell barcode, and `var` columns that are identical in all samples kept.
adata = ad.concat(
    [sample_adata for sample_adata, _ in samples.values()],
    join="inner",
    label="batch",
    keys=[str(batch_index) for batch_index in range(len(samples))],
    index_unique="-",
    merge="same",
)

# and delete individual datasets to save space; the helper names are removed
# as well because they still hold references to the per-sample objects.
del samples, sample_adata
del data_cov1, data_cov15, data_cov17
del data_ctrl5, data_ctrl13, data_ctrl14
#@title Summary of object
# Report the size of the merged object and how its cells are distributed
# over the 'sample' and 'type' annotations.
divider = "====================================="
print(" Cells count: ", adata.n_obs)
print(" Genes count: ", adata.n_vars)
for obs_column in ('sample', 'type'):
    print(divider)
    print(adata.obs[obs_column].value_counts())
print(divider)
# Last expression of the cell, so the notebook renders the resulting DataFrame.
adata.to_df()
Cells count: 9000
Genes count: 33538
=====================================
covid_1 1500
covid_15 1500
covid_17 1500
ctrl_5 1500
ctrl_13 1500
ctrl_14 1500
Name: sample, dtype: int64
=====================================
Covid 4500
Ctrl 4500
Name: type, dtype: int64
=====================================
MIR1302-2HG | FAM138A | OR4F5 | AL627309.1 | AL627309.3 | AL627309.2 | AL627309.4 | AL732372.1 | OR4F29 | AC114498.1 | ... | AC007325.2 | BX072566.1 | AL354822.1 | AC023491.2 | AC004556.1 | AC233755.2 | AC233755.1 | AC240274.1 | AC213203.1 | FAM231C | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AGGGTCCCATGACCCG-1-0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
TACCCACAGCGGGTTA-1-0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
CCCAACTTCATATGGC-1-0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
TCAAGTGTCCGAACGC-1-0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
ATTCCTAGTGACTGTT-1-0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
CGCATAATCTTACGGA-14-5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
GAGGCCTTCTCCTGCA-14-5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
CCCTAACAGTTTCTTC-14-5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
GGGATGATCAAGCTTG-14-5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
CAATGACCACTGCATA-14-5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
9000 rows × 33538 columns
# Overview of the merged object, then the first rows of the gene (var)
# and cell (obs) annotation tables, each section on its own line(s).
print(
    adata,
    "======================================================================",
    adata.var.head(),
    "=====================================================================",
    adata.obs.head(),
    sep="\n",
)
AnnData object with n_obs × n_vars = 9000 × 33538
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types', 'genome'
======================================================================
gene_ids feature_types genome
MIR1302-2HG ENSG00000243485 Gene Expression GRCh38
FAM138A ENSG00000237613 Gene Expression GRCh38
OR4F5 ENSG00000186092 Gene Expression GRCh38
AL627309.1 ENSG00000238009 Gene Expression GRCh38
AL627309.3 ENSG00000239945 Gene Expression GRCh38
=====================================================================
type sample batch
AGGGTCCCATGACCCG-1-0 Covid covid_1 0
TACCCACAGCGGGTTA-1-0 Covid covid_1 0
CCCAACTTCATATGGC-1-0 Covid covid_1 0
TCAAGTGTCCGAACGC-1-0 Covid covid_1 0
ATTCCTAGTGACTGTT-1-0 Covid covid_1 0
#@title Save rawdata adata before QC as adata_raw :
# Keep an independent snapshot of the merged, unfiltered data. A plain
# `adata_raw = adata` would only bind a second name to the same object, so any
# later in-place modification of `adata` would also change `adata_raw`.
adata_raw = adata.copy()
# Persist the pre-QC object so it can be reloaded without repeating the merge.
os.makedirs('Objects/', exist_ok=True)
save_file = 'Objects/adata_raw_covid.h5ad'
adata_raw.write_h5ad(save_file)
# Last expression of the cell: display the summary of the saved object.
adata_raw
AnnData object with n_obs × n_vars = 9000 × 33538
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types', 'genome'
# Display the summary of the pre-QC object.
adata_raw
AnnData object with n_obs × n_vars = 9000 × 33538
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types', 'genome'