Python - Concept

Python - Concept#

In this we are going to learn some python Basic concepts

Python Basics#

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# A string within a pair of triple single quotes, can have multiple lines
seq_5 = '''MALNSGSPPA
IGPYYENHGY HHSYSYSH
UUYSTTSGSG'''
print(seq_5)
print('---------')

MALNSGSPPA
IGPYYENHGY HHSYSYSH
UUYSTTSGSG
---------

dna_seq = 'ATGCGGCTCAGCATGCGACTATATATGCCACTACCGCGCCGC'

print(dna_seq.lower())

print("---------")

print(dna_seq)

print("------------")

print(dna_seq.count("GC"))
print(dna_seq.count("AT"))
print("--------")
print(dna_seq.find("GCATG"))
print(dna_seq[10:19])
print(dna_seq.rfind("CCGC"))
print(len(dna_seq))
print(dna_seq.swapcase())
print(dna_seq.replace("CCGC", "ATCGC", 2))
print(dna_seq.rfind("ATGC"))

atgcggctcagcatgcgactatatatgccactaccgcgccgc
---------
ATGCGGCTCAGCATGCGACTATATATGCCACTACCGCGCCGC
------------
8
5
--------
10
GCATGCGAC
38
42
atgcggctcagcatgcgactatatatgccactaccgcgccgc
ATGCGGCTCAGCATGCGACTATATATGCCACTAATCGCGATCGC
24

dna_seq[4:-5]

'GGCTCAGCATGCGACTATATATGCCACTACCGC'

first_row = '6,148,72,35,0,33.6,0.627,50,1'

Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome = first_row.split(',')

print(Age)
print(BloodPressure)
print(Outcome)

50
72
1

drug_name = ["dapoline", "crocin", "cyclopam", "eldoper"]
drug_name
drug_name[:3]

['dapoline', 'crocin', 'cyclopam']

list_in_list = [[2,3,4], ["str", "df", "wer"], [23,12], [10,23,111,231,12,145]]
list_in_list
print(list_in_list)
print(list_in_list[3][2:5])

[[2, 3, 4], ['str', 'df', 'wer'], [23, 12], [10, 23, 111, 231, 12, 145]]
[111, 231, 12]

x=drug_name.remove("crocin")
print(drug_name)

['dapoline', 'cyclopam', 'eldoper']

print(drug_name.sort())
print(drug_name)

None
['cyclopam', 'dapoline', 'eldoper']

print(type(list_in_list))
print(type(first_row))

<class 'list'>
<class 'str'>

dict1 = {"list1" : [1,2,3], "list2" : [4,6]}
print(dict1)

print(dict1.get("list1"))

print(dict1["list1"][0])

dict2 = {"list3" : ["star", "month"], "list4": ["washing", "belgium"]}

dict = {**dict1, **dict2}
print(dict)
print(dict.keys())

{'list1': [1, 2, 3], 'list2': [4, 6]}
[1, 2, 3]
1
{'list1': [1, 2, 3], 'list2': [4, 6], 'list3': ['star', 'month'], 'list4': ['washing', 'belgium']}
dict_keys(['list1', 'list2', 'list3', 'list4'])

protein = { 'uniprot_ID' : 'P232425', 'Name' : 'Prolinte', 'seq' : 'ATGGTTGSFMN', 'lenght' : 10 }
print(protein)
print(type(protein))

{'uniprot_ID': 'P232425', 'Name': 'Prolinte', 'seq': 'ATGGTTGSFMN', 'lenght': 10}
<class 'dict'>

print(True == False)

False

control_expr = 9
treated_expr = 3.7

if control_expr < treated_expr:
  print("Gene downregulated")
else:
  print("Gene Upregulated")

Gene Upregulated

gene1_length = 10
gene2_length = 15
gene3_length = 40

if gene1_length < gene2_length:
  print(" Gene 2 is longer than gene1")
if gene3_length > gene2_length:
  print("Gene3 is longest of all 3 genes")

 Gene 2 is longer than gene1
Gene3 is longest of all 3 genes

#!pip install colorama

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.5

#----- To check presence of an item in list and substring of string --- #we use in keyword
from colorama import Fore, Back, Style


usr_input = input("please enter drug name: ")

if usr_input in drug_name:
  print(f"{Fore.WHITE} {Back.GREEN} {Style.BRIGHT} {usr_input}  ")
else:
  print(f"{Fore.WHITE} {Back.RED} {Style.BRIGHT} {usr_input} ")

please enter drug name: sinus77
   sinus77 

#-----while Loop  is uses stop-condition //// untill stop condition is true the statement executed
a = int(input("enter a value: "))

while a < 6:
  print(f" {a} is bigger than 6")
  a = a+1
else:
  print(f"{a} is smaller than 6")

enter a value: 3
is bigger than 6
is bigger than 6
is bigger than 6
is smaller than 6

dna_list = ['AGGGC','ATTGGCCTT','AGGTTCC','GGCCTCA','TTTCCGGCTA','CCGCGTA']

print(dna_list)

print(len(dna_list))

['AGGGC', 'ATTGGCCTT', 'AGGTTCC', 'GGCCTCA', 'TTTCCGGCTA', 'CCGCGTA']
6

# to find longer dna from dna_list we need lenght of each dna items , so to find each dna item length we use index
print(len(dna_list))  # lenght of list , total number of items
print(len(dna_list[1]))  # lenght of charcters in dna item from dna list at 1 st index

6
9

number_of_items = len(dna_list)
#print(number_of_items)

index = 0 # start with 0 th element
# now we have index start and total elements now we can loop
longer = 0
while(index < number_of_items):
  print(index)
  current = len(dna_list[index])
  print("length is ", current)
  print("-------------")
  if current > longer:
    longer = current
    print("new longer dna is: ",longer)
    longer_dna_index = dna_list[index]
    print("longer dna  is :", longer_dna_index, "and lenght is ", longer)


  index = index + 1

print("longest dna  is :", longer_dna_index, "and lenght is ", longer)

0
length is  5
-------------
new longer dna is:  5
longer dna  is : AGGGC and lenght is  5
1
length is  9
-------------
new longer dna is:  9
longer dna  is : ATTGGCCTT and lenght is  9
2
length is  7
-------------
3
length is  7
-------------
4
length is  10
-------------
new longer dna is:  10
longer dna  is : TTTCCGGCTA and lenght is  10
5
length is  7
-------------
longest dna  is : TTTCCGGCTA and lenght is  10

number_of_items = len(dna_list)
index = 0
longer = 0

while(index < number_of_items):
  current = len(dna_list[index])
  if current > longer:
    longer = current
    long_dna = dna_list[index]
  index = index +1

print(long_dna, longer)

TTTCCGGCTA 10

for x in dna_list:
  index = 0
  current = len(dna_list[index])
  if current > longer:
    longer = current
  index = index + 1
print(longer)

for drug in drug_name:
  print(drug)

dapoline
crocin
cyclopam
eldoper

import numpy as np

for x in range(10):
  x = x**2+np.sin(x)
  print(x)

0
8414709848078965
909297426825682
141120008059866
243197504692072
04107572533686
72058450180107
65698659871879
98935824662338
41211848524176

for x in range(6 ,10):
  print(x)

def Biolove():
  print("I love Biology")

Biolove()

I love Biology

#--- if we have set of control expression and treated expression of a gene then finding mean expression and calculating which is upregulated

control_expr = [2,4.0, 5.3, 8, 12]
treated_expr = [14, 23, 0.3,0.5, 1.3]

print(len(control_expr))
print(len(treated_expr))

def Mean_expr(expression):
  total_items = len(expression)
  total_exp =  sum(expression)
  mean_expr = (total_exp/total_items)
  print(total_exp)
  return mean_expr

control_mean_expr = Mean_expr(control_expr)
treated_mean_expr = Mean_expr(treated_expr)

if control_mean_expr > treated_mean_expr:
  print("control is upregulated by", control_mean_expr - treated_mean_expr)

else:
  print("Treated is upregulated by", treated_mean_expr - control_mean_expr)

5
5
31.3
39.099999999999994
Treated is upregulated by 1.5599999999999987

import random
for i in range(10):
  x = random.random()
  print(x)

2370533475793002
618866633308759
3693540170724592
6234544733235856
6391075513952862
8020779639302317
8054811697150666
18663645783024552
2873486855738768
23477485117858765

# Define class

class Mrna:
  # class attribute
  contains = "Nucleic Acid"
  # initializer with instance attributes
  def __init__(self, gene_name,seq):

      self.gene_name = gene_name
      self.seq = seq
  # method-1
  def Details(self):
    print("Gene name is ", self.gene_name, " and sequence is ", self.seq)

  # method -2
  def cDNA(self):
    print("its complimentary dna is ", self.seq.replace("U","T"))

mRNA_1 = Mrna("IGKJ1", "GUGGACGUUCGGCCAAGGGACCAAGGUGGAAAUCAAAC")

mRNA_1.Details()

Gene name is  IGKJ1  and sequence is  GUGGACGUUCGGCCAAGGGACCAAGGUGGAAAUCAAAC

Biopython#

from platform import python_version
print(python_version())

!pip install biopython
#!pip install --upgrade biopython
import Bio
print(Bio.__version__)

3.7.13
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
     |████████████████████████████████| 2.3 MB 26.9 MB/s 
?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from biopython) (1.21.6)
Installing collected packages: biopython
Successfully installed biopython-1.79
1.79

# Like above we have created class which do 2 functions . like wise biopython has class Seq to do various functions
import Bio
from Bio.Seq import Seq

my_seq = Seq("ATGCGGCTAAT")
print(type(my_seq))

<class 'Bio.Seq.Seq'>

# it is important to mention whether it is DNA or RNA in biophython sequence , we get generic dna from Bio.Alphabet


my_seq = Seq("ATGCTAGGCATAG")
dna = Seq("ATGGTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
print(type(dna))
print(len(dna))

transcript = dna.transcribe()
print(transcript)
print(type(transcript))

protein1  = transcript.translate()
print(protein1)
print(len(protein1))

protein2  = transcript.translate(table= 2, cds=True)
print(protein2)
print(len(protein2))

protein3 = dna.translate()
print(protein3)

<class 'Bio.Seq.Seq'>
42
AUGGUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG
<class 'Bio.Seq.Seq'>
MVAIVMGR*KGAR*
14
MVAIVMGRWKGAR
13
MVAIVMGR*KGAR*

—-SeqRecord

from typing import Generic
from Bio.Seq import Seq


from Bio.SeqRecord import SeqRecord

pr_record = SeqRecord(Seq("MRAKWRKKRMRRLKRKRRKMRQRSK"), id = "P62945", name= "RL41_HUMAN", description= "60S ribosomal protein L41" )

print(pr_record)
print(type(pr_record))

ID: P62945
Name: RL41_HUMAN
Description: 60S ribosomal protein L41
Number of features: 0
Seq('MRAKWRKKRMRRLKRKRRKMRQRSK')
<class 'Bio.SeqRecord.SeqRecord'>

from Bio import SeqIO

from Bio.Seq import Seq

from Bio.SeqRecord import SeqRecord

NC_005816_record = SeqIO.read("/content/drive/MyDrive/GenBank_data/NC_005816.gb", "genbank")
print(NC_005816_record)

ID: NC_005816.1
Name: NC_005816
Description: Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence
Database cross-references: BioProject:PRJNA224116, BioSample:SAMN02602970, Assembly:GCF_000007885.1
Number of features: 19
/molecule_type=DNA
/topology=circular
/data_file_division=CON
/date=19-JUN-2022
/accessions=['NC_005816']
/sequence_version=1
/keywords=['RefSeq']
/source=Yersinia pestis biovar Microtus str. 91001
/organism=Yersinia pestis biovar Microtus str. 91001
/taxonomy=['Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacterales', 'Yersiniaceae', 'Yersinia']
/references=[Reference(title='Genetics of metabolic variations between Yersinia pestis biovars and the proposal of a new biovar, microtus', ...), Reference(title='Complete genome sequence of Yersinia pestis strain 91001, an isolate avirulent to humans', ...), Reference(title='Direct Submission', ...)]
/comment=REFSEQ INFORMATION: The reference sequence is identical to
AE017046.1.
The annotation was added by the NCBI Prokaryotic Genome Annotation
Pipeline (PGAP). Information about PGAP can be found here:
https://www.ncbi.nlm.nih.gov/genome/annotation_prok/
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Date', '06/19/2022 10:59:08'), ('Annotation Pipeline', 'NCBI Prokaryotic Genome Annotation Pipeline (PGAP)'), ('Annotation Method', 'Best-placed reference protein set; GeneMarkS-2+'), ('Annotation Software revision', '6.1'), ('Features Annotated', 'Gene; CDS; rRNA; tRNA; ncRNA; repeat_region'), ('Genes (total)', '4,363'), ('CDSs (total)', '4,259'), ('Genes (coding)', '4,040'), ('CDSs (with protein)', '4,040'), ('Genes (RNA)', '104'), ('rRNAs', '8, 7, 7 (5S, 16S, 23S)'), ('complete rRNAs', '8, 7, 7 (5S, 16S, 23S)'), ('tRNAs', '72'), ('ncRNAs', '10'), ('Pseudo Genes (total)', '219'), ('CDSs (without protein)', '219'), ('Pseudo Genes (ambiguous residues)', '0 of 219'), ('Pseudo Genes (frameshifted)', '95 of 219'), ('Pseudo Genes (incomplete)', '151 of 219'), ('Pseudo Genes (internal stop)', '32 of 219'), ('Pseudo Genes (multiple problems)', '49 of 219'), ('CRISPR Arrays', '1')]))])
/contig=join(AE017046.1:1..9609)
Undefined sequence of length 9609

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
sequence = ""
with open("/content/drive/MyDrive/GenBank_data/sample.fasta.txt", "r") as f:
  lines = f.readlines()
  for line in lines:
    if line.startswith(">"):
      description= line.rstrip()
    else:
      sequence = sequence + line.rstrip()

protein_record = SeqRecord(Seq(sequence), id = description, name= description, description= description)

print(protein_record)

ID: >sp|Q9SE35|20-107
Name: >sp|Q9SE35|20-107
Description: >sp|Q9SE35|20-107
Number of features: 0
Seq('QSIADLAAANLSTEDSKSAQLISADSSDDASDSSVESVDAASSDVSGSSVESVD...RIL')

# Above code write with minimal lines of code using biopython parse method

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

records = Bio.SeqIO.parse("/content/drive/MyDrive/GenBank_data/sample.fasta.txt", "fasta")
for record in records:
  print(record)

ID: sp|Q9SE35|20-107
Name: sp|Q9SE35|20-107
Description: sp|Q9SE35|20-107
Number of features: 0
Seq('QSIADLAAANLSTEDSKSAQLISADSSDDASDSSVESVDAASSDVSGSSVESVD...RIL')

# Multi sequence fasta file

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# records_list = []

# records =  Bio.SeqIO.parse("/content/drive/MyDrive/GenBank_data/multisequence.txt", "fasta")

#for record in records:
  #print(record.id)
  #records_list.append(record)


# record_list = []
for record in Bio.SeqIO.parse("/content/drive/MyDrive/GenBank_data/multisequence.fasta.txt", "fasta"):
  print(record.id)
  #record_list.append(record)

spQ3ZM63ETDA_HUMAN
spP53803RPAB4_HUMAN
spQ538Z0LUZP6_HUMAN
spQ9BZ97TTY13_HUMAN
spP58511SI11A_HUMAN

seq1 = "TGATGCACATTGA"
seq2 = "TAGACATGACACCACAG"

from Bio import Align

aligner = Align.PairwiseAligner()
aligner
print(aligner)
print("\n ============== \n")
results = aligner.align(seq1, seq2)
print(results)
for each in results:
  print(each)
score = aligner.score(seq1, seq2)
print(score)

from Bio.Seq import Seq

sequence = Seq("ATTTTCTTGCTCTTGAGCTCTGGCACTTCTCTGCTGCTGTC")

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
from Bio.Blast import ParseBlastTable

result_handle =NCBIWWW.qblast("blastn", "nt", sequence= sequence)
blast_records = NCBIXML.parse(result_handle)
print(blast_records)
print("----------------")

<generator object parse at 0x7f03abf1e5d0>
----------------

#blast_records = NCBIXML.parse(result_handle)

#for b in blast_records:
 # for alignment in b.alignments[:2]:
  #  print(alignment)

print(blast_records)
print("----------------")
#blast_records = NCBIXML.parse(result_handle)
for b in blast_records:

  for alignment in b.alignments[:2]:
    for hsp in alignment.hsps:
      print('****Alignment****')
      print('sequence:', alignment.title)
      print('length:', alignment.length)
      print('e value:', hsp.expect)
      print(hsp.query[0:75] + '…')
      print(hsp.match[0:75] + '…')
      print(hsp.sbjct[0:75] + '…')

<generator object parse at 0x7f03abf7c050>
----------------

Python for Data Analysis#

import numpy as np

l = [1, 2, 3]

a = np.array([1, 2, 3])

print("list is : ", l)
print("array is :", a)

# iterate over each element in list and array by using for loop

for each in l:
  print(each)

print("-------------")

for each in a:
  print(each)



print("added list is :",l + [4])
print("------------")

print("added array is:", a + 4)
print("added array with np.array is:", a + np.array(4))


# --- In list if we want to do mathematical operations like addition substraction multiplication we have use loop to iterate each element and then perform mathematical operation
# --- where as in Numpy array No need of looping directly mathematical operation perform on each element

# {Similarly ** exponent  work directly on NUmpy array No need for loope , where as for list need looping and new empty list}

l2 = []
for each in l:
  l2.append(each **2)
print(l2)

a= a **2
print(a)

list is :  [1, 2, 3]
array is : [1 2 3]
1
2
3
-------------
1
2
3
added list is : [1, 2, 3, 4]
------------
added array is: [5 6 7]
added array with np.array is: [5 6 7]
[1, 4, 9]
[1 4 9]

n =np.array( [l, l2, [5, 6, 7], [8,3,2]] )
print(n)

[[1 2 3]
 [1 4 9]
 [5 6 7]
 [8 3 2]]

n = np.array([[1,2,3], [4,5,6]])
print(n)
print("--------------")
print(n.T) # Transpose of array matrix
print("--------------")

print(np.exp(n))
print("--------------")
print(np.exp(n.T))
print("--------------")
print(np.sqrt(n))
print("--------------")
print(np.exp([[1,2,3],[4,5,6]]))

[[1 2 3]
 [4 5 6]]
--------------
[[1 4]
 [2 5]
 [3 6]]
--------------
[[  2.71828183   7.3890561   20.08553692]
 [ 54.59815003 148.4131591  403.42879349]]
--------------
[[  2.71828183  54.59815003]
 [  7.3890561  148.4131591 ]
 [ 20.08553692 403.42879349]]
--------------
[[1.         1.41421356 1.73205081]
 [2.         2.23606798 2.44948974]]
--------------
[[  2.71828183   7.3890561   20.08553692]
 [ 54.59815003 148.4131591  403.42879349]]

import numpy as np
a = np.array([[1,2,3],[4,5,6]])
b= np.array([[1,2,2]])
c=  np.array([[6,7,8], [2,4,5]])
print(a)
print("-----------")
print(b)
print("-----------")
print(c)
print("-----------")
print(a*b)
print("-----------")
print("-----------")
print(a * c)

[[1 2 3]
 [4 5 6]]
-----------
[[1 2 2]]
-----------
[[6 7 8]
 [2 4 5]]
-----------
[[ 1  4  6]
 [ 4 10 12]]
-----------
-----------
[[ 6 14 24]
 [ 8 20 30]]

import numpy as np

# Ax = B
# x = BA-1

a = np.array([[1,2,3],[4,5,6]]) #Shape(2×3)
b = np.array([[1,2],[3,4]])#Shape(2×2)

print("a is:", a)
print("----------")
print("b is :", b)
print("-----------")

print(a.dot(b))

import numpy as np
a = np.array([[5,6],[7,8]])
print("a is : \n", a)
print("----------")
print(np.linalg.det(a))
print("----------")
a_inv =np.linalg.inv(a)
print(a.shape)
print("-------")


print(np.linalg.inv(a))
print("-------")
print(a_inv.shape)

print("----------")

print(a. dot( a_inv))

a is : 
 [[5 6]
 [7 8]]
----------
-2.000000000000005
----------
(2, 2)
-------
[[-4.   3. ]
 [ 3.5 -2.5]]
-------
(2, 2)
----------
[[1. 0.]
 [0. 1.]]

# solve
#   -3x 2y + 4z = 9
#   3y 2z = 5
#   4x 3y + 2z = 7


# left side is one matrix and rght side is another matrix Ax = B  ===> x = BA-1

# take the inverse  of left matrix and multiply (dot matrix muliplication)by right matrix we get solution x, y, z

A = np.array( [[-3, 2, 4], [0, 3, 2], [4, 3, 2]] )
print(A)
print("---------")

B =np.array( [9,5,7])
print(B)

print("#####################")

print(np.linalg.inv(A).dot(B)) # here x = 0.5, y= -0.125, z= 2.6875

print("---------------------------------------------------------------------------")

print(" SImilarly we can get by using np.linalg.solve method by passing 2 matrix")

print(np.linalg.solve(A, B))

[[-3  2  4]
 [ 0  3  2]
 [ 4  3  2]]
---------
[9 5 7]
#####################
[ 0.5    -0.125   2.6875]
---------------------------------------------------------------------------
 SImilarly we can get by using np.linalg.solve method by passing 2 matrix
[ 0.5    -0.125   2.6875]

Pandas

import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/Diabetes/diabetes.csv")
print(type(df))
print(df.head)
print("-------------")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
<bound method NDFrame.head of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                       0.351   31        0  
2                       0.672   32        1  
3                       0.167   21        0  
4                       2.288   33        1  
..                        ...  ...      ...  
763                     0.171   63        0  
764                     0.340   27        0  
765                     0.245   30        0  
766                     0.349   47        1  
767                     0.315   23        0  

[768 rows x 9 columns]>
-------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

df.columns= ['Pregnancies', 'Glucose', 'BP', 'ST', 'Insulin','BMI', 'DPF', 'Age', 'Outcome']
df.head()
print(df["Glucose"])
print(type(df["Glucose"]))

print("-------------------------------------------")

print(df[["Age","BMI","Outcome"]])
print(type(df[["Age","BMI","Outcome"]]))

    148
     85
    183
     89
    137
      ... 
  101
  122
  121
  126
   93
Name: Glucose, Length: 768, dtype: int64
<class 'pandas.core.series.Series'>
-------------------------------------------
     Age   BMI  Outcome
   50  33.6        1
   31  26.6        0
   32  23.3        1
   21  28.1        0
   33  43.1        1
..   ...   ...      ...
 63  32.9        0
 27  36.8        0
 30  26.2        0
 47  30.1        1
 23  30.4        0

[768 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>

df.head()

	Pregnancies	Glucose	BP	ST	Insulin	BMI	DPF	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

df.loc[2:4:,"BP": "DPF"]

	BP	ST	Insulin	BMI	DPF
2	64	0	0	23.3	0.672
3	66	23	94	28.1	0.167
4	40	35	168	43.1	2.288

print(df.iloc[2:5])
print(df.loc[2:5])

   Pregnancies  Glucose  BP  ST  Insulin   BMI    DPF  Age  Outcome
          8      183  64   0        0  23.3  0.672   32        1
          1       89  66  23       94  28.1  0.167   21        0
          0      137  40  35      168  43.1  2.288   33        1
   Pregnancies  Glucose  BP  ST  Insulin   BMI    DPF  Age  Outcome
          8      183  64   0        0  23.3  0.672   32        1
          1       89  66  23       94  28.1  0.167   21        0
          0      137  40  35      168  43.1  2.288   33        1
          5      116  74   0        0  25.6  0.201   30        0

df.head()

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

df.values

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

Matplotlib

# we have cell growth rate or 2 cancer cell lines bg01 and wa07 and we have numbers as fold change values of growth
bg01 = [0.00,26.70,69.89,176.14,448.30,590.91]
wa07 = [0.00,21.88,126.56,438.28,706.25,840.63]
days = [1,2,3,4,5,6]

print(bg01)
print(type(bg01))
print(wa07)
print(type(wa07))

[0.0, 26.7, 69.89, 176.14, 448.3, 590.91]
<class 'list'>
[0.0, 21.88, 126.56, 438.28, 706.25, 840.63]
<class 'list'>

# plot bg01 groth rate curve

import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(days, bg01, "g-*")
plt.xlabel("Number of Dyas")
plt.ylabel("Fold cahnge of cell line")
plt.title("Groth rate cycle of ' bg01 ' cell line\n\n")
plt.legend()
plt.show()

No handles with labels found to put in legend.

../../_images/5e5cc7910492ddb01428fdeee091245f297c5a0a35d29eb1050a705dadccbaeb.png

# plotting both plots in same using subplot

# plt.subplot(number of rows, number of columns, plot number)

plt.subplot(1,2,1)
plt.plot(days, bg01, "g-*", scaley= True, scalex= True)
plt.xlabel("Number of Days")
plt.ylabel("Fold change cell rate")
plt.title(" growth rate cycle of # bg01 # cell line")


plt.subplot(1, 2, 2)
plt.plot(days, wa07, "r-^", scaley= True, scalex= True)
plt.xlabel("Nmber of Days")
plt.ylabel("Fold change values of cell growth")
plt.title("Growth rate cycle of # wa07 # cell line")

plt.tight_layout()
plt.figure(figsize= (6,4), dpi= 300, )


plt.show()

../../_images/71ebee23533de71d1c03e7d44932ecb0a41bbec0d8708cb743406899a2cf6aec.png

<Figure size 1800x1200 with 0 Axes>

df.head()

plt.hist(df["Glucose"])
plt.xlabel("GLucose concentration")
plt.ylabel("observations")

Text(0, 0.5, 'observations')

../../_images/666cc24c22834abf7104fef597785d27cbb62d07d888639f3a193e670bbd9c02.png

df.head()

	Pregnancies	Glucose	BP	ST	Insulin	BMI	DPF	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

diabetic = df[df['Outcome']==1]['Glucose']
non_diabetic = df[df['Outcome']==0]['Glucose']
plt.boxplot([diabetic,non_diabetic],labels = ['Diabetic','Non-Diabetic'])
plt.ylabel("Blood Glucose Level")
plt.show()

/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))

../../_images/b0fc5484a42cb1262ca7199ab7e5320d4a342442bb9fd240fb6cd58842d23019.png

Seaborn

import seaborn as sns
iris = sns.load_dataset("iris")
iris.head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

from IPython.core.pylabtools import figsize
# set figure and axes
fig, axes = plt.subplots(nrows = 1, ncols=2 , figsize=(8, 10))

axes_0 = sns.distplot(iris["sepal_length"], kde= True, hist= True, bins= 10, ax= axes[0])
axes_0.set_title("Sepal lenght distribution with 10 bins")

axes_1 = sns.distplot(iris["sepal_length"], kde= True, hist= True, bins= 20, ax = axes[1])
axes_1.set_title("Sepal lenght distribution with 20 bins")

/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Text(0.5, 1.0, 'Sepal lenght distribution with 20 bins')

../../_images/a7550a5fd70266a34b53b64e9080437ed0bc260cf49bcfa4d2896075c9dc2850.png

sns.jointplot(x = "petal_length", y = "petal_width", data= iris, kind= "reg")

<seaborn.axisgrid.JointGrid at 0x7f273fdcb390>

../../_images/2d1f06e8c32bcca7e40e92dc8446cc5f7501a9316a03dc7c8cd688f9c2b834eb.png

penguins = sns.load_dataset("penguins")
print(penguins)

    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
  Adelie  Torgersen            39.1           18.7              181.0   
  Adelie  Torgersen            39.5           17.4              186.0   
  Adelie  Torgersen            40.3           18.0              195.0   
  Adelie  Torgersen             NaN            NaN                NaN   
  Adelie  Torgersen            36.7           19.3              193.0   
..      ...        ...             ...            ...                ...   
Gentoo     Biscoe             NaN            NaN                NaN   
Gentoo     Biscoe            46.8           14.3              215.0   
Gentoo     Biscoe            50.4           15.7              222.0   
Gentoo     Biscoe            45.2           14.8              212.0   
Gentoo     Biscoe            49.9           16.1              213.0   

     body_mass_g     sex  
       3750.0    Male  
       3800.0  Female  
       3250.0  Female  
          NaN     NaN  
       3450.0  Female  
..           ...     ...  
        NaN     NaN  
     4850.0  Female  
     5750.0    Male  
     5200.0  Female  
     5400.0    Male  

[344 rows x 7 columns]

penguins.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

penguins_final = penguins.dropna()
print(penguins_final)
print(penguins_final.isna().sum())

    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen            39.1           18.7              181.0   
1    Adelie  Torgersen            39.5           17.4              186.0   
2    Adelie  Torgersen            40.3           18.0              195.0   
4    Adelie  Torgersen            36.7           19.3              193.0   
5    Adelie  Torgersen            39.3           20.6              190.0   
..      ...        ...             ...            ...                ...   
338  Gentoo     Biscoe            47.2           13.7              214.0   
340  Gentoo     Biscoe            46.8           14.3              215.0   
341  Gentoo     Biscoe            50.4           15.7              222.0   
342  Gentoo     Biscoe            45.2           14.8              212.0   
343  Gentoo     Biscoe            49.9           16.1              213.0   

     body_mass_g     sex  
0         3750.0    Male  
1         3800.0  Female  
2         3250.0  Female  
4         3450.0  Female  
5         3650.0    Male  
..           ...     ...  
338       4925.0  Female  
340       4850.0  Female  
341       5750.0    Male  
342       5200.0  Female  
343       5400.0    Male  

[333 rows x 7 columns]
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

sns.pairplot(penguins_final, hue= "species")

<seaborn.axisgrid.PairGrid at 0x7f273f47b0d0>

../../_images/a74a6c28aed7d9c161097859c0e5fe3dba4d81594aaf87a6d095e4e9e031d469.png

penguins_final.head()

sns.lmplot(x = "body_mass_g", y="bill_depth_mm", data= penguins_final, hue= "sex")

<seaborn.axisgrid.FacetGrid at 0x7f2741d06fd0>

../../_images/52a7105d6ed4bddf9a28d6517e08ebb6e1c6252b0eb0b09b3194f836c112b591.png

—Plotly—–

!pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: plotly in /usr/local/lib/python3.7/dist-packages (5.5.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from plotly) (8.0.1)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from plotly) (1.15.0)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pandas as pd

pd.options.plotting.backend = "plotly"

# dataset from seaborn
penguins = sns.load_dataset("penguins")
penguins.head()

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
3	Adelie	Torgersen	NaN	NaN	NaN	NaN	NaN
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female

penguins["body_mass_g"].hist()

penguins["bill_depth_mm"].hist()

Python - Concept

Contents

Python - Concept#

Python Basics#

Biopython#

Python for Data Analysis#