Data#

Get the Data#

Note

We are going to download a set of six PBMC 10x datasets: three from COVID-19 patients and three from healthy controls. Each sample has been subsampled to 1,500 cells.

#@title check working Directory
# Print the notebook's current working directory via an IPython shell escape.
!pwd
/content/drive/MyDrive/scRNA_using_Python
#@title check content of folder
# List the working directory in long format with human-readable sizes.
!ls -lh
total 18K
drwx------ 2 root root 4.0K May 14 16:39 data
-rw------- 1 root root    0 May 28 04:27 new_file_in_working_directory.txt
drwx------ 2 root root 4.0K May 14 16:21 Objects
-rw------- 1 root root 9.2K May 28 04:45 requirementsscRNA.txt
-rw------- 1 root root    1 May 17 20:08 requirements.txt
#@title Download data from URL

%%bash
# Ensure the six subsampled PBMC .h5 files are present under data/raw,
# downloading any that are missing from the workshop's GitHub repository.
# Prints the number of expected files already present, then lists data/raw.
set -euo pipefail

readonly raw_dir="data/raw"
readonly base_url="https://raw.githubusercontent.com/NBISweden/workshop-scRNAseq/new_dataset/labs/data/covid_data_GSE149689/sub"
readonly -a samples=(
  Normal_PBMC_13.h5
  Normal_PBMC_14.h5
  Normal_PBMC_5.h5
  nCoV_PBMC_15.h5
  nCoV_PBMC_17.h5
  nCoV_PBMC_1.h5
)

# create a data directory.
mkdir -p "$raw_dir"

# first check how many of the expected files are already there.
# Testing each expected name (instead of parsing `ls` output) avoids errors
# when no .h5 file exists yet and is not fooled by unrelated .h5 files.
count=0
for sample in "${samples[@]}"; do
  if [[ -s "${raw_dir}/${sample}" ]]; then
    count=$((count + 1))
  fi
done
printf '%s\n' "$count"

# if not all expected files are present, fetch the missing ones from github.
if ((count < ${#samples[@]})); then
  for sample in "${samples[@]}"; do
    target="${raw_dir}/${sample}"
    if [[ -s "$target" ]]; then
      continue
    fi
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as if it were the data file. The download goes to a
    # temporary name first so an interrupted transfer never leaves a
    # truncated .h5 file behind under the final name.
    if ! curl --fail --location --output "${target}.part" "${base_url}/${sample}"; then
      rm -f -- "${target}.part"
      printf 'error: failed to download %s\n' "$sample" >&2
      exit 1
    fi
    mv -- "${target}.part" "$target"
  done
fi

ls -lGa "$raw_dir"
6
total 22176
-rw------- 1 root 3169573 May 14 16:39 nCoV_PBMC_15.h5
-rw------- 1 root 4105636 May 14 16:39 nCoV_PBMC_17.h5
-rw------- 1 root 3426598 May 14 16:39 nCoV_PBMC_1.h5
-rw------- 1 root 4391693 May 14 16:39 Normal_PBMC_13.h5
-rw------- 1 root 3806925 May 14 16:39 Normal_PBMC_14.h5
-rw------- 1 root 3806384 May 14 16:39 Normal_PBMC_5.h5
# Show the directory tree rooted at the current working directory.
!tree
.
├── data
│   ├── raw
│   │   ├── nCoV_PBMC_15.h5
│   │   ├── nCoV_PBMC_17.h5
│   │   ├── nCoV_PBMC_1.h5
│   │   ├── Normal_PBMC_13.h5
│   │   ├── Normal_PBMC_14.h5
│   │   └── Normal_PBMC_5.h5
│   └── regev_lab_cell_cycle_genes.txt
├── new_file_in_working_directory.txt
├── Objects
│   ├── adata_raw_covid.h5ad
│   ├── sc_qc_filtered_covid.h5ad
│   ├── sc_QCFN_covid.h5ad
│   ├── sc_QCNFS_covid.h5ad
│   └── sc_QCNFSDM_covid.h5ad
├── requirementsscRNA.txt
└── requirements.txt

3 directories, 15 files
#@title Load data individually


def _read_unique(h5_path):
    """Read one 10x .h5 file and make its variable (gene) names unique."""
    sample = sc.read_10x_h5(h5_path)
    sample.var_names_make_unique()
    return sample


# Three COVID-19 patient samples ...
data_cov1 = _read_unique('./data/raw/nCoV_PBMC_1.h5')
data_cov15 = _read_unique('./data/raw/nCoV_PBMC_15.h5')
data_cov17 = _read_unique('./data/raw/nCoV_PBMC_17.h5')
# ... and three healthy-control samples.
data_ctrl5 = _read_unique('./data/raw/Normal_PBMC_5.h5')
data_ctrl13 = _read_unique('./data/raw/Normal_PBMC_13.h5')
data_ctrl14 = _read_unique('./data/raw/Normal_PBMC_14.h5')

#@title Create Merged Object

# Tag every cell of each dataset with its condition ('type') and its sample
# label ('sample'); 'type' is written first so the obs column order is kept.
for _dataset, _condition, _label in (
    (data_cov1, "Covid", "covid_1"),
    (data_cov15, "Covid", "covid_15"),
    (data_cov17, "Covid", "covid_17"),
    (data_ctrl5, "Ctrl", "ctrl_5"),
    (data_ctrl13, "Ctrl", "ctrl_13"),
    (data_ctrl14, "Ctrl", "ctrl_14"),
):
    _dataset.obs['type'] = _condition
    _dataset.obs['sample'] = _label
# Drop the loop names so they do not keep the last dataset alive.
del _dataset, _condition, _label

# Combine the six datasets into a single AnnData object.
# NOTE(review): AnnData.concatenate appears to be deprecated in favour of
# anndata.concat in recent anndata releases; it is kept here because switching
# may change the obs-name suffixes and the 'batch' column - confirm before
# migrating.
adata = data_cov1.concatenate(
    data_cov15,
    data_cov17,
    data_ctrl5,
    data_ctrl13,
    data_ctrl14,
)

# The per-sample objects are no longer needed; remove the names to save space.
del data_cov1, data_cov15, data_cov17
del data_ctrl5, data_ctrl13, data_ctrl14


#@title Summary of object

# Report the size of the merged object (cells x genes) followed by the number
# of cells per sample and per condition.
_separator = "====================================="

print(" Cells count: ", adata.n_obs)
print(" Genes count: ", adata.n_vars)
print(_separator)

for _column in ('sample', 'type'):
    print(adata.obs[_column].value_counts())
    print(_separator)

# Last expression of the cell: the expression matrix as a DataFrame.
adata.to_df()
 Cells count:  9000
 Genes count:  33538
=====================================
covid_1     1500
covid_15    1500
covid_17    1500
ctrl_5      1500
ctrl_13     1500
ctrl_14     1500
Name: sample, dtype: int64
=====================================
Covid    4500
Ctrl     4500
Name: type, dtype: int64
=====================================
MIR1302-2HG FAM138A OR4F5 AL627309.1 AL627309.3 AL627309.2 AL627309.4 AL732372.1 OR4F29 AC114498.1 ... AC007325.2 BX072566.1 AL354822.1 AC023491.2 AC004556.1 AC233755.2 AC233755.1 AC240274.1 AC213203.1 FAM231C
AGGGTCCCATGACCCG-1-0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TACCCACAGCGGGTTA-1-0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CCCAACTTCATATGGC-1-0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
TCAAGTGTCCGAACGC-1-0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
ATTCCTAGTGACTGTT-1-0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
CGCATAATCTTACGGA-14-5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GAGGCCTTCTCCTGCA-14-5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CCCTAACAGTTTCTTC-14-5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
GGGATGATCAAGCTTG-14-5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
CAATGACCACTGCATA-14-5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

9000 rows × 33538 columns

# Print the AnnData summary, then the first rows of the gene annotation (var)
# and of the cell annotation (obs), each section separated by a ruler line.
print(
    adata,
    "======================================================================",
    adata.var.head(),
    "=====================================================================",
    adata.obs.head(),
    sep="\n",
)
AnnData object with n_obs × n_vars = 9000 × 33538
    obs: 'type', 'sample', 'batch'
    var: 'gene_ids', 'feature_types', 'genome'
======================================================================
                    gene_ids    feature_types  genome
MIR1302-2HG  ENSG00000243485  Gene Expression  GRCh38
FAM138A      ENSG00000237613  Gene Expression  GRCh38
OR4F5        ENSG00000186092  Gene Expression  GRCh38
AL627309.1   ENSG00000238009  Gene Expression  GRCh38
AL627309.3   ENSG00000239945  Gene Expression  GRCh38
=====================================================================
                       type   sample batch
AGGGTCCCATGACCCG-1-0  Covid  covid_1     0
TACCCACAGCGGGTTA-1-0  Covid  covid_1     0
CCCAACTTCATATGGC-1-0  Covid  covid_1     0
TCAAGTGTCCGAACGC-1-0  Covid  covid_1     0
ATTCCTAGTGACTGTT-1-0  Covid  covid_1     0
#@title Save rawdata adata before QC as adata_raw :

# Keep an independent snapshot of the unfiltered data. A plain assignment
# (adata_raw = adata) would only bind a second name to the same object, so any
# later in-place change to `adata` would also show up in `adata_raw`.
adata_raw = adata.copy()

# Write the raw (pre-QC) merged object to disk, creating the output folder
# first if it does not exist yet.
os.makedirs('Objects/', exist_ok=True)

save_file = 'Objects/adata_raw_covid.h5ad'
adata.write_h5ad(save_file)

# Last expression of the cell: show the summary of the raw snapshot.
adata_raw
AnnData object with n_obs × n_vars = 9000 × 33538
    obs: 'type', 'sample', 'batch'
    var: 'gene_ids', 'feature_types', 'genome'
# Display the summary of the adata_raw object.
adata_raw
AnnData object with n_obs × n_vars = 9000 × 33538
    obs: 'type', 'sample', 'batch'
    var: 'gene_ids', 'feature_types', 'genome'