AI Concepts#
In this section we are going to learn some AI concepts using general and medical examples.
Principal Component Analysis#
# Principal component analysis (PCA) is all about preserving as much variance as we can, because variance is informative
# Variance measures how far a variable's values spread around the mean of the dataset; the variables are features, so we reduce the features while preserving variance
# Multiplying the raw feature matrix X by the eigen matrix (whose columns are eigenvectors, each with an eigenvalue) gives the new features
# The new features are combinations of the raw features, not a subset of them
# Let's understand this using the iris dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
iris = sns.load_dataset("iris")
iris.head()
|  | sepal_length | sepal_width | petal_length | petal_width | species |
| --- | --- | --- | --- | --- | --- |
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris.describe()
|  | sepal_length | sepal_width | petal_length | petal_width |
| --- | --- | --- | --- | --- |
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
X = iris.iloc[:, 0:4]
print(X)
Y = iris.iloc[:, 4]
print(Y)
# Standardization: rescale each feature so that mean = 0 and sd = 1
iris_std = (X-X.mean())/ X.std()
print(iris_std)
# concatenate Y
iris_std = pd.concat([iris_std, Y], axis=1)
print(iris_std)
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
.. ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
[150 rows x 4 columns]
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
...
145 virginica
146 virginica
147 virginica
148 virginica
149 virginica
Name: species, Length: 150, dtype: object
sepal_length sepal_width petal_length petal_width
0 -0.897674 1.015602 -1.335752 -1.311052
1 -1.139200 -0.131539 -1.335752 -1.311052
2 -1.380727 0.327318 -1.392399 -1.311052
3 -1.501490 0.097889 -1.279104 -1.311052
4 -1.018437 1.245030 -1.335752 -1.311052
.. ... ... ... ...
145 1.034539 -0.131539 0.816859 1.443994
146 0.551486 -1.278680 0.703564 0.919223
147 0.793012 -0.131539 0.816859 1.050416
148 0.430722 0.786174 0.930154 1.443994
149 0.068433 -0.131539 0.760211 0.788031
[150 rows x 4 columns]
sepal_length sepal_width petal_length petal_width species
0 -0.897674 1.015602 -1.335752 -1.311052 setosa
1 -1.139200 -0.131539 -1.335752 -1.311052 setosa
2 -1.380727 0.327318 -1.392399 -1.311052 setosa
3 -1.501490 0.097889 -1.279104 -1.311052 setosa
4 -1.018437 1.245030 -1.335752 -1.311052 setosa
.. ... ... ... ... ...
145 1.034539 -0.131539 0.816859 1.443994 virginica
146 0.551486 -1.278680 0.703564 0.919223 virginica
147 0.793012 -0.131539 0.816859 1.050416 virginica
148 0.430722 0.786174 0.930154 1.443994 virginica
149 0.068433 -0.131539 0.760211 0.788031 virginica
[150 rows x 5 columns]
print(iris.describe())
print(iris_std.describe())
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
sepal_length sepal_width petal_length petal_width
count 1.500000e+02 1.500000e+02 1.500000e+02 1.500000e+02
mean -5.684342e-16 -7.815970e-16 -2.842171e-16 -3.789561e-16
std 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
min -1.863780e+00 -2.425820e+00 -1.562342e+00 -1.442245e+00
25% -8.976739e-01 -5.903951e-01 -1.222456e+00 -1.179859e+00
50% -5.233076e-02 -1.315388e-01 3.353541e-01 1.320673e-01
75% 6.722490e-01 5.567457e-01 7.602115e-01 7.880307e-01
max 2.483699e+00 3.080455e+00 1.779869e+00 1.706379e+00
# Let's check by plotting whether standardization preserved the information (variance)
print("Plots show the distribution and scatter shapes are preserved, only recentered at zero by standardization")
print("---------------------------------------------------------------------------------------------------------")
fig, axes = plt.subplots(nrows= 2, ncols= 2, figsize = (20, 10))
print(axes)
axes_0 = sns.histplot(iris['petal_length'], kde= True, ax = axes[0,0])
axes_0.set_title("raw iris petal length distribution plot")
axes_1 = sns.histplot(iris_std['petal_length'], kde= True, ax= axes[0,1])
axes_1.set_title("standardized iris petal length distribution plot")
axes_2 = sns.scatterplot(x= "petal_length", y= "petal_width" , data= iris, hue= 'species', ax = axes[1,0])
axes_2.set_title("raw iris scatter plot")
axes_3 = sns.scatterplot(x= "petal_length", y= "petal_width", data= iris_std, hue= 'species', ax = axes[1,1])
axes_3.set_title("standardized iris scatter plot")
Plots show the distribution and scatter shapes are preserved, only recentered at zero by standardization
---------------------------------------------------------------------------------------------------------
[[<matplotlib.axes._subplots.AxesSubplot object at 0x7f3d5bf2ae50>
<matplotlib.axes._subplots.AxesSubplot object at 0x7f3d5be46790>]
[<matplotlib.axes._subplots.AxesSubplot object at 0x7f3d5bdfdd50>
<matplotlib.axes._subplots.AxesSubplot object at 0x7f3d5bdc3250>]]
Text(0.5, 1.0, 'standardized iris scatter plot')
# Now the data is standardized; next we can perform PCA:
# ---- prepare the correlation matrix using numpy
# ---- find the eigenvalues and eigenvectors using np.linalg.eig
# ---- project onto the top components (a manual sketch of this follows below, before the scikit-learn version)
X_std = iris_std.iloc[:, 0:4]
print(X_std)
y = iris_std.iloc[:,4]
print(y)
sepal_length sepal_width petal_length petal_width
0 -0.897674 1.015602 -1.335752 -1.311052
1 -1.139200 -0.131539 -1.335752 -1.311052
2 -1.380727 0.327318 -1.392399 -1.311052
3 -1.501490 0.097889 -1.279104 -1.311052
4 -1.018437 1.245030 -1.335752 -1.311052
.. ... ... ... ...
145 1.034539 -0.131539 0.816859 1.443994
146 0.551486 -1.278680 0.703564 0.919223
147 0.793012 -0.131539 0.816859 1.050416
148 0.430722 0.786174 0.930154 1.443994
149 0.068433 -0.131539 0.760211 0.788031
[150 rows x 4 columns]
0 setosa
1 setosa
2 setosa
3 setosa
4 setosa
...
145 virginica
146 virginica
147 virginica
148 virginica
149 virginica
Name: species, Length: 150, dtype: object
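Before switching to scikit-learn, here is a minimal sketch of the manual route described in the comments above; the variable names are illustrative, and sklearn's PCA below performs the same steps internally (its components may differ from these only in sign):

corr_matrix = np.corrcoef(X_std.T)                    # 4 x 4 correlation matrix of the standardized features
eig_values, eig_vectors = np.linalg.eig(corr_matrix)  # eigenvalues measure the variance along each eigenvector
order = np.argsort(eig_values)[::-1]                  # rank the directions by variance, largest first
top_two = eig_vectors[:, order[:2]]                   # eigen matrix restricted to the top 2 components
manual_PCs = X_std.values @ top_two                   # new features = standardized features x eigen matrix
print(manual_PCs[:5])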
from sklearn.decomposition import PCA
sklearn_pca = PCA(n_components= 2, random_state= 100)
PCs = sklearn_pca.fit_transform(X_std)
print(PCs.shape)
print(X_std.shape)
print(PCs)
(150, 2)
(150, 4)
[[-2.25714118 0.47842383]
[-2.07401302 -0.67188269]
[-2.35633511 -0.34076642]
[-2.29170679 -0.59539986]
[-2.3818627 0.64467566]
[-2.06870061 1.4842053 ]
[-2.43586845 0.04748512]
[-2.22539189 0.222403 ]
[-2.32684533 -1.1116037 ]
[-2.17703491 -0.46744757]
[-2.15907699 1.04020587]
[-2.31836413 0.132634 ]
[-2.2110437 -0.72624318]
[-2.62430902 -0.95829635]
[-2.19139921 1.85384655]
[-2.25466121 2.67731523]
[-2.20021676 1.47865573]
[-2.18303613 0.48720613]
[-1.89223284 1.40032757]
[-2.33554476 1.1240836 ]
[-1.90793125 0.40749058]
[-2.19964383 0.92103587]
[-2.76508142 0.4568133 ]
[-1.81259716 0.08527285]
[-2.21972701 0.13679618]
[-1.9453293 -0.62352971]
[-2.04430277 0.24135499]
[-2.1613365 0.52538942]
[-2.13241965 0.312172 ]
[-2.25769799 -0.33660425]
[-2.13297647 -0.50285608]
[-1.82547925 0.42228039]
[-2.60621687 1.78758727]
[-2.43800983 2.1435468 ]
[-2.10292986 -0.45866527]
[-2.20043723 -0.20541922]
[-2.03831765 0.65934923]
[-2.51889339 0.59031516]
[-2.42152026 -0.90116107]
[-2.16246625 0.2679812 ]
[-2.27884081 0.44024054]
[-1.85191836 -2.32961074]
[-2.54511203 -0.47750102]
[-1.95788857 0.47074961]
[-2.12992356 1.13841546]
[-2.06283361 -0.70867859]
[-2.37677076 1.11668869]
[-2.38638171 -0.38495723]
[-2.22200263 0.99462767]
[-2.19647504 0.00918558]
[ 1.09810244 0.86009103]
[ 0.72889556 0.59262936]
[ 1.2368358 0.61423989]
[ 0.40612251 -1.7485462 ]
[ 1.07188379 -0.20772515]
[ 0.38738955 -0.59130272]
[ 0.74403715 0.77043827]
[-0.48569562 -1.846244 ]
[ 0.92480346 0.03211848]
[ 0.01138804 -1.03056578]
[-0.10982834 -2.64521111]
[ 0.43922201 -0.06308385]
[ 0.56023148 -1.75883213]
[ 0.71715934 -0.18560282]
[-0.03324333 -0.43753742]
[ 0.87248429 0.50736424]
[ 0.34908221 -0.19565627]
[ 0.1582798 -0.78945101]
[ 1.22100316 -1.61682728]
[ 0.16436725 -1.29825994]
[ 0.73521959 0.39524745]
[ 0.47469691 -0.41592689]
[ 1.23005729 -0.93020944]
[ 0.63074514 -0.41499744]
[ 0.70031506 -0.06320009]
[ 0.87135454 0.24995602]
[ 1.25231375 -0.07699807]
[ 1.35386953 0.33020546]
[ 0.66258066 -0.2251735 ]
[-0.04012419 -1.05518358]
[ 0.13035846 -1.55705555]
[ 0.02337438 -1.56722524]
[ 0.2407318 -0.7746612 ]
[ 1.05755171 -0.6317269 ]
[ 0.22323093 -0.28681266]
[ 0.42770626 0.84275892]
[ 1.04522645 0.52030871]
[ 1.04104379 -1.37837105]
[ 0.06935597 -0.21877043]
[ 0.28253073 -1.32488615]
[ 0.27814596 -1.11628885]
[ 0.62248441 0.02483981]
[ 0.33540673 -0.98510383]
[-0.36097409 -2.01249582]
[ 0.28762268 -0.85287312]
[ 0.09105561 -0.18058714]
[ 0.22695654 -0.38363487]
[ 0.57446378 -0.15435649]
[-0.4461723 -1.53863746]
[ 0.25587339 -0.59685229]
[ 1.83841002 0.86751506]
[ 1.15401555 -0.6965364 ]
[ 2.19790361 0.56013398]
[ 1.43534213 -0.0468307 ]
[ 1.86157577 0.2940597 ]
[ 2.74268509 0.79773671]
[ 0.36579225 -1.55628918]
[ 2.29475181 0.41866302]
[ 1.99998633 -0.70906323]
[ 2.25223216 1.9145963 ]
[ 1.35962064 0.6904434 ]
[ 1.59732747 -0.42029243]
[ 1.87761053 0.41784981]
[ 1.25590769 -1.15837974]
[ 1.46274487 -0.44079488]
[ 1.5847682 0.67398689]
[ 1.46651849 0.25476833]
[ 2.4182277 2.5481248 ]
[ 3.29964148 0.01772158]
[ 1.25954707 -1.70104672]
[ 2.03091256 0.90742744]
[ 0.97471535 -0.56985526]
[ 2.8879765 0.41225995]
[ 1.32878064 -0.4802025 ]
[ 1.6950553 1.01053648]
[ 1.94780139 1.00441272]
[ 1.17118007 -0.31533806]
[ 1.01754169 0.06413118]
[ 1.78237879 -0.18673563]
[ 1.85742501 0.56041329]
[ 2.4278203 0.25841871]
[ 2.29723178 2.61755442]
[ 1.85648383 -0.17795333]
[ 1.1104277 -0.29194458]
[ 1.19845835 -0.80860636]
[ 2.78942561 0.85394254]
[ 1.57099294 1.06501321]
[ 1.34179696 0.42102015]
[ 0.92173701 0.01716559]
[ 1.84586124 0.67387065]
[ 2.00808316 0.61183593]
[ 1.89543421 0.68727307]
[ 1.15401555 -0.6965364 ]
[ 2.03374499 0.86462403]
[ 1.99147547 1.04566567]
[ 1.86425786 0.38567404]
[ 1.55935649 -0.89369285]
[ 1.51609145 0.26817075]
[ 1.36820418 1.00787793]
[ 0.95744849 -0.02425043]]
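Since the whole point of PCA is preserving variance, it is worth checking how much survived the reduction from four features to two; explained_variance_ratio_ reports each component's share (for iris the first two components typically retain roughly 95% of the variance between them):

print(sklearn_pca.explained_variance_ratio_)
print("Total variance preserved:", sklearn_pca.explained_variance_ratio_.sum())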
iris_transform_df = pd.DataFrame(PCs, columns=["PC1", "PC2"])
iris_transform_df = pd.concat([iris_transform_df, y], axis = 1)
iris_transform_df.head()
fig, ax = plt.subplots(figsize = (10,8))
ax_0 = sns.scatterplot(x= "PC1", y= "PC2", data= iris_transform_df, hue= "species")
plt.show()
# The art of enabling machines to form rules and find trends from data without being explicitly programmed is known as Machine Learning
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
dataset = pd.read_csv("/content/drive/MyDrive/Diabetes/diabetes.txt", delimiter= "\t")
dataset.head()
|  | AGE | SEX | BMI | BP | S1 | S2 | S3 | S4 | S5 | S6 | Y |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 59 | 2 | 32.1 | 101.0 | 157 | 93.2 | 38.0 | 4.0 | 4.8598 | 87 | 151 |
| 1 | 48 | 1 | 21.6 | 87.0 | 183 | 103.2 | 70.0 | 3.0 | 3.8918 | 69 | 75 |
| 2 | 72 | 2 | 30.5 | 93.0 | 156 | 93.6 | 41.0 | 4.0 | 4.6728 | 85 | 141 |
| 3 | 24 | 1 | 25.3 | 84.0 | 198 | 131.4 | 40.0 | 5.0 | 4.8903 | 89 | 206 |
| 4 | 50 | 1 | 23.0 | 101.0 | 192 | 125.4 | 52.0 | 4.0 | 4.2905 | 80 | 135 |
print(dataset.info())
print("----------------------------------------------------------------------")
print(dataset.isna().sum())
print("---------------------------------------------------------------------")
print(dataset.describe().T)
# Separate the dataset into independent and dependent features
X = dataset.iloc[:, :-1]
print(X)
print("-------------------------------------------------------------------------")
y= dataset.iloc[:, -1]
print(y)
AGE SEX BMI BP S1 S2 S3 S4 S5 S6
0 59 2 32.1 101.00 157 93.2 38.0 4.00 4.8598 87
1 48 1 21.6 87.00 183 103.2 70.0 3.00 3.8918 69
2 72 2 30.5 93.00 156 93.6 41.0 4.00 4.6728 85
3 24 1 25.3 84.00 198 131.4 40.0 5.00 4.8903 89
4 50 1 23.0 101.00 192 125.4 52.0 4.00 4.2905 80
.. ... ... ... ... ... ... ... ... ... ...
437 60 2 28.2 112.00 185 113.8 42.0 4.00 4.9836 93
438 47 2 24.9 75.00 225 166.0 42.0 5.00 4.4427 102
439 60 2 24.9 99.67 162 106.6 43.0 3.77 4.1271 95
440 36 1 30.0 95.00 201 125.2 42.0 4.79 5.1299 85
441 36 1 19.6 71.00 250 133.2 97.0 3.00 4.5951 92
[442 rows x 10 columns]
-------------------------------------------------------------------------
0 151
1 75
2 141
3 206
4 135
...
437 178
438 104
439 132
440 220
441 57
Name: Y, Length: 442, dtype: int64
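Before fitting a model, a quick optional check of how each feature correlates with the target can set expectations for the regression; this sketch uses the diabetes DataFrame loaded above:

# Pearson correlation of every feature with the target Y
print(dataset.corr()["Y"].sort_values(ascending=False))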
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 110)
print(X_train)
print(X_test)
print("-----------------------------------------------------")
print(y_train)
print(y_test)
AGE SEX BMI BP S1 S2 S3 S4 S5 S6
96 64 2 27.3 109.00 186 107.6 38.0 5.00 5.3083 99
333 56 2 25.8 103.00 177 114.4 34.0 5.00 4.9628 99
317 54 1 23.2 110.67 238 162.8 48.0 4.96 4.9127 108
239 55 1 28.2 91.00 250 140.2 67.0 4.00 5.3660 103
113 54 2 27.7 113.00 200 128.4 37.0 5.00 5.1533 113
.. ... ... ... ... ... ... ... ... ... ...
381 29 2 18.1 73.00 158 99.0 41.0 4.00 4.4998 78
271 59 2 27.2 107.00 158 102.0 39.0 4.00 4.4427 93
61 37 2 26.8 79.00 157 98.0 28.0 6.00 5.0434 96
227 67 2 23.6 111.33 189 105.4 70.0 2.70 4.2195 93
128 34 1 20.6 87.00 185 112.2 58.0 3.00 4.3041 74
[353 rows x 10 columns]
AGE SEX BMI BP S1 S2 S3 S4 S5 S6
294 55 2 23.5 93.00 177 126.8 41.0 4.00 3.8286 83
159 47 1 30.4 120.00 199 120.0 46.0 4.00 5.1059 87
312 28 1 24.2 93.00 174 106.4 54.0 3.00 4.2195 84
119 53 1 22.0 94.00 175 88.0 59.0 3.00 4.9416 98
134 28 1 30.4 85.00 198 115.6 67.0 3.00 4.3438 80
.. ... ... ... ... ... ... ... ... ... ...
414 71 2 27.0 93.33 269 190.2 41.0 6.56 5.2417 93
166 33 2 20.8 84.00 125 70.2 46.0 3.00 3.7842 66
434 53 1 26.5 97.00 193 122.4 58.0 3.00 4.1431 99
390 51 2 32.8 112.00 202 100.6 37.0 5.00 5.7746 109
168 49 2 31.9 94.00 234 155.8 34.0 7.00 5.3982 122
[89 rows x 10 columns]
-----------------------------------------------------
96 150
333 164
317 190
239 262
113 297
...
381 104
271 127
61 144
227 108
128 115
Name: Y, Length: 353, dtype: int64
294 55
159 195
312 144
119 200
134 103
...
414 131
166 70
434 49
390 277
168 268
Name: Y, Length: 89, dtype: int64
from sklearn.linear_model import LinearRegression
L_regressor = LinearRegression()
L_regressor.fit(X_train, y_train)
#---- test model on test data X_test
prediction = L_regressor.predict(X_test)
print(prediction)
print(y_test)
plt.scatter(y_test, prediction)
plt.xlabel("Actual values")
plt.ylabel("Predicted values")
plt.title("Actual values vs predicted values")
[ 88.7257154 236.01349565 122.82671275 152.91194894 133.26299572
155.10995617 108.45070799 166.16370245 206.40303033 227.12512886
45.58436461 257.33397866 188.25079196 143.91214004 101.58303786
129.18862987 157.18468301 140.00779148 142.28184775 115.48953396
129.15211612 182.9636348 52.25034904 286.86546595 242.7761079
46.07643676 146.37057691 125.90228716 137.64669731 102.12906943
159.294184 127.19797503 120.2095064 189.73221207 121.94777703
87.58871142 162.09127654 214.57935651 100.0105965 265.88447661
237.09032754 102.23950004 115.64512814 193.83264705 185.47519786
186.85928531 281.30774621 86.98229992 220.56270381 141.45606787
101.56547169 173.03077936 96.25903702 159.50774879 53.63613547
208.84468478 165.93698745 124.42552414 160.59146202 248.64622776
103.47508113 148.42042807 205.43430394 93.90192239 161.46649216
91.00134606 253.13344701 96.16425468 164.29821281 181.27263938
58.24275384 116.28152401 183.49639474 169.54560591 110.11920824
236.24602514 59.7852014 155.27187615 287.9524243 213.84063469
82.59836521 259.69744698 178.24353173 107.16945725 167.34950979
58.7571307 135.75101228 250.65478989 223.09415657]
294 55
159 195
312 144
119 200
134 103
...
414 131
166 70
434 49
390 277
168 268
Name: Y, Length: 89, dtype: int64
Text(0.5, 1.0, 'Actual values vs predicted values')
from sklearn import metrics
print("Mean absolute error : ", metrics.mean_absolute_percentage_error(y_test, prediction))
print("Mean squared error :", metrics.mean_squared_error(y_test, prediction))
Mean absolute percentage error : 0.3300922548696817
Mean squared error : 2228.090831059464
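MSE is in squared units of the target, so two extra views are often easier to read: the square root of the MSE is back in the target's own units, and the R² score gives the fraction of variance explained. A short sketch reusing the predictions above:

print("Root mean squared error :", np.sqrt(metrics.mean_squared_error(y_test, prediction)))
print("R^2 score :", metrics.r2_score(y_test, prediction))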
# Intercept and coefficient parameters
print(L_regressor.intercept_)
print("------------------------")
print(L_regressor.coef_)
# Map coefficients to features
coef_df = pd.DataFrame(L_regressor.coef_, X.columns, columns= ["Coefficient"])
coef_df
-307.57104546505354
------------------------
[ 1.40608997e-01 -2.56128390e+01 5.08771783e+00 1.19293496e+00
-1.08819176e+00 8.03227722e-01 4.24108443e-02 4.53744471e+00
6.51304992e+01 3.71060693e-01]
|  | Coefficient |
| --- | --- |
| AGE | 0.140609 |
| SEX | -25.612839 |
| BMI | 5.087718 |
| BP | 1.192935 |
| S1 | -1.088192 |
| S2 | 0.803228 |
| S3 | 0.042411 |
| S4 | 4.537445 |
| S5 | 65.130499 |
| S6 | 0.371061 |
Logistic Regression#
import pandas as pd
dataset = pd.read_csv("/content/drive/MyDrive/Breast Cancer Data/Breast_cancer_data.csv")
print(dataset.head())
print("------------------------------------------------------------------------------")
print(dataset.columns)
id diagnosis radius_mean texture_mean perimeter_mean area_mean \
0 842302 M 17.99 10.38 122.80 1001.0
1 842517 M 20.57 17.77 132.90 1326.0
2 84300903 M 19.69 21.25 130.00 1203.0
3 84348301 M 11.42 20.38 77.58 386.1
4 84358402 M 20.29 14.34 135.10 1297.0
smoothness_mean compactness_mean concavity_mean concave points_mean \
0 0.11840 0.27760 0.3001 0.14710
1 0.08474 0.07864 0.0869 0.07017
2 0.10960 0.15990 0.1974 0.12790
3 0.14250 0.28390 0.2414 0.10520
4 0.10030 0.13280 0.1980 0.10430
... texture_worst perimeter_worst area_worst smoothness_worst \
0 ... 17.33 184.60 2019.0 0.1622
1 ... 23.41 158.80 1956.0 0.1238
2 ... 25.53 152.50 1709.0 0.1444
3 ... 26.50 98.87 567.7 0.2098
4 ... 16.67 152.20 1575.0 0.1374
compactness_worst concavity_worst concave points_worst symmetry_worst \
0 0.6656 0.7119 0.2654 0.4601
1 0.1866 0.2416 0.1860 0.2750
2 0.4245 0.4504 0.2430 0.3613
3 0.8663 0.6869 0.2575 0.6638
4 0.2050 0.4000 0.1625 0.2364
fractal_dimension_worst Unnamed: 32
0 0.11890 NaN
1 0.08902 NaN
2 0.08758 NaN
3 0.17300 NaN
4 0.07678 NaN
[5 rows x 33 columns]
------------------------------------------------------------------------------
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
dtype='object')
dataset.T.head()
|  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| id | 842302 | 842517 | 84300903 | 84348301 | 84358402 | 843786 | 844359 | 84458202 | 844981 | 84501001 | ... | 925291 | 925292 | 925311 | 925622 | 926125 | 926424 | 926682 | 926954 | 927241 | 92751 |
| diagnosis | M | M | M | M | M | M | M | M | M | M | ... | B | B | B | M | M | M | M | M | M | B |
| radius_mean | 17.99 | 20.57 | 19.69 | 11.42 | 20.29 | 12.45 | 18.25 | 13.71 | 13.0 | 12.46 | ... | 11.51 | 14.05 | 11.2 | 15.22 | 20.92 | 21.56 | 20.13 | 16.6 | 20.6 | 7.76 |
| texture_mean | 10.38 | 17.77 | 21.25 | 20.38 | 14.34 | 15.7 | 19.98 | 20.83 | 21.82 | 24.04 | ... | 23.93 | 27.15 | 29.37 | 30.62 | 25.09 | 22.39 | 28.25 | 28.08 | 29.33 | 24.54 |
| perimeter_mean | 122.8 | 132.9 | 130.0 | 77.58 | 135.1 | 82.57 | 119.6 | 90.2 | 87.5 | 83.97 | ... | 74.52 | 91.38 | 70.67 | 103.4 | 143.0 | 142.0 | 131.2 | 108.3 | 140.1 | 47.92 |
5 rows × 569 columns
# the columns id and Unnamed: 32 are not required, so we can drop them
dataset = dataset.drop(["id", "Unnamed: 32"], axis=1)
print(dataset.head())
print(dataset.describe().T)
print("----------------------------------------------------------------------------------------------")
print(dataset.info())
count mean std min \
radius_mean 569.0 14.127292 3.524049 6.981000
texture_mean 569.0 19.289649 4.301036 9.710000
perimeter_mean 569.0 91.969033 24.298981 43.790000
area_mean 569.0 654.889104 351.914129 143.500000
smoothness_mean 569.0 0.096360 0.014064 0.052630
compactness_mean 569.0 0.104341 0.052813 0.019380
concavity_mean 569.0 0.088799 0.079720 0.000000
concave points_mean 569.0 0.048919 0.038803 0.000000
symmetry_mean 569.0 0.181162 0.027414 0.106000
fractal_dimension_mean 569.0 0.062798 0.007060 0.049960
radius_se 569.0 0.405172 0.277313 0.111500
texture_se 569.0 1.216853 0.551648 0.360200
perimeter_se 569.0 2.866059 2.021855 0.757000
area_se 569.0 40.337079 45.491006 6.802000
smoothness_se 569.0 0.007041 0.003003 0.001713
compactness_se 569.0 0.025478 0.017908 0.002252
concavity_se 569.0 0.031894 0.030186 0.000000
concave points_se 569.0 0.011796 0.006170 0.000000
symmetry_se 569.0 0.020542 0.008266 0.007882
fractal_dimension_se 569.0 0.003795 0.002646 0.000895
radius_worst 569.0 16.269190 4.833242 7.930000
texture_worst 569.0 25.677223 6.146258 12.020000
perimeter_worst 569.0 107.261213 33.602542 50.410000
area_worst 569.0 880.583128 569.356993 185.200000
smoothness_worst 569.0 0.132369 0.022832 0.071170
compactness_worst 569.0 0.254265 0.157336 0.027290
concavity_worst 569.0 0.272188 0.208624 0.000000
concave points_worst 569.0 0.114606 0.065732 0.000000
symmetry_worst 569.0 0.290076 0.061867 0.156500
fractal_dimension_worst 569.0 0.083946 0.018061 0.055040
25% 50% 75% max
radius_mean 11.700000 13.370000 15.780000 28.11000
texture_mean 16.170000 18.840000 21.800000 39.28000
perimeter_mean 75.170000 86.240000 104.100000 188.50000
area_mean 420.300000 551.100000 782.700000 2501.00000
smoothness_mean 0.086370 0.095870 0.105300 0.16340
compactness_mean 0.064920 0.092630 0.130400 0.34540
concavity_mean 0.029560 0.061540 0.130700 0.42680
concave points_mean 0.020310 0.033500 0.074000 0.20120
symmetry_mean 0.161900 0.179200 0.195700 0.30400
fractal_dimension_mean 0.057700 0.061540 0.066120 0.09744
radius_se 0.232400 0.324200 0.478900 2.87300
texture_se 0.833900 1.108000 1.474000 4.88500
perimeter_se 1.606000 2.287000 3.357000 21.98000
area_se 17.850000 24.530000 45.190000 542.20000
smoothness_se 0.005169 0.006380 0.008146 0.03113
compactness_se 0.013080 0.020450 0.032450 0.13540
concavity_se 0.015090 0.025890 0.042050 0.39600
concave points_se 0.007638 0.010930 0.014710 0.05279
symmetry_se 0.015160 0.018730 0.023480 0.07895
fractal_dimension_se 0.002248 0.003187 0.004558 0.02984
radius_worst 13.010000 14.970000 18.790000 36.04000
texture_worst 21.080000 25.410000 29.720000 49.54000
perimeter_worst 84.110000 97.660000 125.400000 251.20000
area_worst 515.300000 686.500000 1084.000000 4254.00000
smoothness_worst 0.116600 0.131300 0.146000 0.22260
compactness_worst 0.147200 0.211900 0.339100 1.05800
concavity_worst 0.114500 0.226700 0.382900 1.25200
concave points_worst 0.064930 0.099930 0.161400 0.29100
symmetry_worst 0.250400 0.282200 0.317900 0.66380
fractal_dimension_worst 0.071460 0.080040 0.092080 0.20750
----------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 diagnosis 569 non-null object
1 radius_mean 569 non-null float64
2 texture_mean 569 non-null float64
3 perimeter_mean 569 non-null float64
4 area_mean 569 non-null float64
5 smoothness_mean 569 non-null float64
6 compactness_mean 569 non-null float64
7 concavity_mean 569 non-null float64
8 concave points_mean 569 non-null float64
9 symmetry_mean 569 non-null float64
10 fractal_dimension_mean 569 non-null float64
11 radius_se 569 non-null float64
12 texture_se 569 non-null float64
13 perimeter_se 569 non-null float64
14 area_se 569 non-null float64
15 smoothness_se 569 non-null float64
16 compactness_se 569 non-null float64
17 concavity_se 569 non-null float64
18 concave points_se 569 non-null float64
19 symmetry_se 569 non-null float64
20 fractal_dimension_se 569 non-null float64
21 radius_worst 569 non-null float64
22 texture_worst 569 non-null float64
23 perimeter_worst 569 non-null float64
24 area_worst 569 non-null float64
25 smoothness_worst 569 non-null float64
26 compactness_worst 569 non-null float64
27 concavity_worst 569 non-null float64
28 concave points_worst 569 non-null float64
29 symmetry_worst 569 non-null float64
30 fractal_dimension_worst 569 non-null float64
dtypes: float64(30), object(1)
memory usage: 137.9+ KB
None
print(dataset.diagnosis.value_counts())
print(dataset.iloc[:,0])
print(dataset.head(2))
B 357
M 212
Name: diagnosis, dtype: int64
0 M
1 M
2 M
3 M
4 M
..
564 M
565 M
566 M
567 M
568 B
Name: diagnosis, Length: 569, dtype: object
diagnosis radius_mean texture_mean perimeter_mean area_mean \
0 M 17.99 10.38 122.8 1001.0
1 M 20.57 17.77 132.9 1326.0
smoothness_mean compactness_mean concavity_mean concave points_mean \
0 0.11840 0.27760 0.3001 0.14710
1 0.08474 0.07864 0.0869 0.07017
symmetry_mean ... radius_worst texture_worst perimeter_worst \
0 0.2419 ... 25.38 17.33 184.6
1 0.1812 ... 24.99 23.41 158.8
area_worst smoothness_worst compactness_worst concavity_worst \
0 2019.0 0.1622 0.6656 0.7119
1 1956.0 0.1238 0.1866 0.2416
concave points_worst symmetry_worst fractal_dimension_worst
0 0.2654 0.4601 0.11890
1 0.1860 0.2750 0.08902
[2 rows x 31 columns]
X = dataset.iloc[:, 1:]
print(X)
y= dataset.iloc[:, 0]
print(y)
radius_mean texture_mean perimeter_mean area_mean smoothness_mean \
0 17.99 10.38 122.80 1001.0 0.11840
1 20.57 17.77 132.90 1326.0 0.08474
2 19.69 21.25 130.00 1203.0 0.10960
3 11.42 20.38 77.58 386.1 0.14250
4 20.29 14.34 135.10 1297.0 0.10030
.. ... ... ... ... ...
564 21.56 22.39 142.00 1479.0 0.11100
565 20.13 28.25 131.20 1261.0 0.09780
566 16.60 28.08 108.30 858.1 0.08455
567 20.60 29.33 140.10 1265.0 0.11780
568 7.76 24.54 47.92 181.0 0.05263
compactness_mean concavity_mean concave points_mean symmetry_mean \
0 0.27760 0.30010 0.14710 0.2419
1 0.07864 0.08690 0.07017 0.1812
2 0.15990 0.19740 0.12790 0.2069
3 0.28390 0.24140 0.10520 0.2597
4 0.13280 0.19800 0.10430 0.1809
.. ... ... ... ...
564 0.11590 0.24390 0.13890 0.1726
565 0.10340 0.14400 0.09791 0.1752
566 0.10230 0.09251 0.05302 0.1590
567 0.27700 0.35140 0.15200 0.2397
568 0.04362 0.00000 0.00000 0.1587
fractal_dimension_mean ... radius_worst texture_worst \
0 0.07871 ... 25.380 17.33
1 0.05667 ... 24.990 23.41
2 0.05999 ... 23.570 25.53
3 0.09744 ... 14.910 26.50
4 0.05883 ... 22.540 16.67
.. ... ... ... ...
564 0.05623 ... 25.450 26.40
565 0.05533 ... 23.690 38.25
566 0.05648 ... 18.980 34.12
567 0.07016 ... 25.740 39.42
568 0.05884 ... 9.456 30.37
perimeter_worst area_worst smoothness_worst compactness_worst \
0 184.60 2019.0 0.16220 0.66560
1 158.80 1956.0 0.12380 0.18660
2 152.50 1709.0 0.14440 0.42450
3 98.87 567.7 0.20980 0.86630
4 152.20 1575.0 0.13740 0.20500
.. ... ... ... ...
564 166.10 2027.0 0.14100 0.21130
565 155.00 1731.0 0.11660 0.19220
566 126.70 1124.0 0.11390 0.30940
567 184.60 1821.0 0.16500 0.86810
568 59.16 268.6 0.08996 0.06444
concavity_worst concave points_worst symmetry_worst \
0 0.7119 0.2654 0.4601
1 0.2416 0.1860 0.2750
2 0.4504 0.2430 0.3613
3 0.6869 0.2575 0.6638
4 0.4000 0.1625 0.2364
.. ... ... ...
564 0.4107 0.2216 0.2060
565 0.3215 0.1628 0.2572
566 0.3403 0.1418 0.2218
567 0.9387 0.2650 0.4087
568 0.0000 0.0000 0.2871
fractal_dimension_worst
0 0.11890
1 0.08902
2 0.08758
3 0.17300
4 0.07678
.. ...
564 0.07115
565 0.06637
566 0.07820
567 0.12400
568 0.07039
[569 rows x 30 columns]
0 M
1 M
2 M
3 M
4 M
..
564 M
565 M
566 M
567 M
568 B
Name: diagnosis, Length: 569, dtype: object
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_norm = sc.fit_transform(X)
print(X_norm)
[[ 1.09706398 -2.07333501 1.26993369 ... 2.29607613 2.75062224
1.93701461]
[ 1.82982061 -0.35363241 1.68595471 ... 1.0870843 -0.24388967
0.28118999]
[ 1.57988811 0.45618695 1.56650313 ... 1.95500035 1.152255
0.20139121]
...
[ 0.70228425 2.0455738 0.67267578 ... 0.41406869 -1.10454895
-0.31840916]
[ 1.83834103 2.33645719 1.98252415 ... 2.28998549 1.91908301
2.21963528]
[-1.80840125 1.22179204 -1.81438851 ... -1.74506282 -0.04813821
-0.75120669]]
# split the standardized data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_norm,y, test_size= 0.2, random_state = 101)
# initialize the classifier
from sklearn.linear_model import LogisticRegression
Log_classifier = LogisticRegression(random_state=0)
Log_classifier.fit(X_train, y_train)
#----
y_pred = Log_classifier.predict(X_test)
print(y_pred)
# compare y_pred vs actual y_test
print("-----------------------------------------------------------------")
print(y_test)
['B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B'
'M' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'M' 'B' 'M' 'B' 'M' 'B' 'B'
'M' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'M'
'B' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'M' 'B'
'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'M'
'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
'B' 'B' 'B' 'B' 'B' 'M']
-----------------------------------------------------------------
107 B
437 B
195 B
141 M
319 B
..
19 B
313 B
139 B
495 B
317 M
Name: diagnosis, Length: 114, dtype: object
# Model evaluation
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix( y_test, y_pred)
print(cm)
print("---------------------------------------------------------------------------")
print("classification report \n", classification_report(y_test, y_pred))
[[72 0]
[ 1 41]]
---------------------------------------------------------------------------
classification report
precision recall f1-score support
B 0.99 1.00 0.99 72
M 1.00 0.98 0.99 42
accuracy 0.99 114
macro avg 0.99 0.99 0.99 114
weighted avg 0.99 0.99 0.99 114
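In a medical setting the confusion matrix is often re-expressed as sensitivity and specificity; a minimal sketch, assuming the alphabetical label order ('B' then 'M') that confusion_matrix uses here, with malignant ('M') as the positive class:

tn, fp, fn, tp = cm.ravel()    # row/column order is ['B', 'M'], so 'B' is the negative class
sensitivity = tp / (tp + fn)   # fraction of malignant cases correctly flagged
specificity = tn / (tn + fp)   # fraction of benign cases correctly cleared
print("Sensitivity :", sensitivity)
print("Specificity :", specificity)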
# --- Getting informative features by looking at the coefficients
coeff_df = pd.DataFrame(Log_classifier.coef_.T, X.columns,
columns=['Coefficient'])
coeff_df =coeff_df.sort_values(by=['Coefficient'],ascending=False)
coeff_df
# the features “radius_se” and “texture_worst” have a major contribution to determining the malignancy
|  | Coefficient |
| --- | --- |
| radius_se | 1.379673 |
| texture_worst | 1.169080 |
| concave points_mean | 1.156939 |
| area_se | 0.984579 |
| concave points_worst | 0.933089 |
| radius_worst | 0.903471 |
| smoothness_worst | 0.872462 |
| area_worst | 0.846826 |
| concavity_mean | 0.807876 |
| concavity_worst | 0.802358 |
| perimeter_se | 0.637758 |
| fractal_dimension_worst | 0.618413 |
| perimeter_worst | 0.568124 |
| symmetry_worst | 0.562905 |
| texture_mean | 0.387553 |
| area_mean | 0.314432 |
| concave points_se | 0.267092 |
| radius_mean | 0.266278 |
| perimeter_mean | 0.232904 |
| compactness_worst | 0.052849 |
| symmetry_mean | 0.042495 |
| smoothness_mean | -0.036783 |
| symmetry_se | -0.042753 |
| smoothness_se | -0.135624 |
| texture_se | -0.153152 |
| concavity_se | -0.224247 |
| fractal_dimension_mean | -0.378132 |
| compactness_mean | -0.590002 |
| fractal_dimension_se | -0.697663 |
| compactness_se | -0.780794 |
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(10,4))
sns.boxplot(x="radius_se", y="diagnosis", data=dataset,ax=axes[0])
sns.boxplot(x='texture_worst',y='diagnosis',
data=dataset,ax=axes[1])
fig.tight_layout()
Decision Trees and Random Forest#
# A decision tree is built from the whole dataset,
# whereas a random forest randomly selects subsets of features to create many different decision trees
# --- Tree pruning --- letting a decision tree grow unchecked leads to overfitting, so for real-world data we restrict the tree and tolerate a little impurity; this is called tree pruning
# A random forest trains multiple decision trees on samples of the dataset, with the samples drawn randomly with replacement (bootstrap sampling)
# Random forest models belong to the family of ensemble methods
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv("/content/drive/MyDrive/Breast Cancer Data/Breast_cancer_data.csv")
print(dataset.head())
print("------------------------------------------------------------------------------")
print(dataset.columns)
id diagnosis radius_mean texture_mean perimeter_mean area_mean \
0 842302 M 17.99 10.38 122.80 1001.0
1 842517 M 20.57 17.77 132.90 1326.0
2 84300903 M 19.69 21.25 130.00 1203.0
3 84348301 M 11.42 20.38 77.58 386.1
4 84358402 M 20.29 14.34 135.10 1297.0
smoothness_mean compactness_mean concavity_mean concave points_mean \
0 0.11840 0.27760 0.3001 0.14710
1 0.08474 0.07864 0.0869 0.07017
2 0.10960 0.15990 0.1974 0.12790
3 0.14250 0.28390 0.2414 0.10520
4 0.10030 0.13280 0.1980 0.10430
... texture_worst perimeter_worst area_worst smoothness_worst \
0 ... 17.33 184.60 2019.0 0.1622
1 ... 23.41 158.80 1956.0 0.1238
2 ... 25.53 152.50 1709.0 0.1444
3 ... 26.50 98.87 567.7 0.2098
4 ... 16.67 152.20 1575.0 0.1374
compactness_worst concavity_worst concave points_worst symmetry_worst \
0 0.6656 0.7119 0.2654 0.4601
1 0.1866 0.2416 0.1860 0.2750
2 0.4245 0.4504 0.2430 0.3613
3 0.8663 0.6869 0.2575 0.6638
4 0.2050 0.4000 0.1625 0.2364
fractal_dimension_worst Unnamed: 32
0 0.11890 NaN
1 0.08902 NaN
2 0.08758 NaN
3 0.17300 NaN
4 0.07678 NaN
[5 rows x 33 columns]
------------------------------------------------------------------------------
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
dtype='object')
dataset = dataset.drop(["id", "Unnamed: 32"], axis=1)
dataset.columns
Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst'],
dtype='object')
#--- Train test split
X = dataset.iloc[:, 1:]
y = dataset.iloc[:, 0]
print(X)
print(":=========================:")
print(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
print(X_train, X_test)
# Decision tree call and train on data
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
DecisionTreeClassifier()
# prediction and evaluation
preds = classifier.predict(X_test)
# Evaluation
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(y_test, preds))
precision recall f1-score support
B 0.94 0.88 0.91 105
M 0.82 0.91 0.86 66
accuracy 0.89 171
macro avg 0.88 0.89 0.88 171
weighted avg 0.89 0.89 0.89 171
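The pruning idea from the notes above can be tried by capping the tree's growth; the depth and leaf-size limits here are illustrative guesses, not tuned values:

# a pruned tree: shallower and with a minimum leaf size, trading a little purity for generalization
pruned_classifier = DecisionTreeClassifier(max_depth=4, min_samples_leaf=5)
pruned_classifier.fit(X_train, y_train)
print(classification_report(y_test, pruned_classifier.predict(X_test)))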
# random forest from ensemble methods
from sklearn.ensemble import RandomForestClassifier
rfc_classifier = RandomForestClassifier(n_estimators= 300)
rfc_classifier.fit(X_train, y_train)
rfc_pred = rfc_classifier.predict(X_test)
print(classification_report(y_test, rfc_pred))
precision recall f1-score support
B 0.99 0.96 0.98 105
M 0.94 0.98 0.96 66
accuracy 0.97 171
macro avg 0.97 0.97 0.97 171
weighted avg 0.97 0.97 0.97 171
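Random forests also expose how much each feature contributed to the splits, which gives a quick ranking of informative features; a short sketch using the fitted model above:

importances = pd.Series(rfc_classifier.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))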
# Cross validation and prediction
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100)
cross_validator = cross_validate(classifier, X, y, cv=5, scoring='accuracy', return_estimator=True)
print(cross_validator['test_score'])
[0.93859649 0.94736842 0.99122807 0.97368421 0.98230088]
# the estimator with the highest test score above is the best model
# here the best score is 0.99122807
best_index = np.argmax(cross_validator["test_score"])
best_model = cross_validator["estimator"][best_index]
best_model
rfc_best_pred = best_model.predict(X_test)
print(classification_report(y_test, rfc_best_pred))
precision recall f1-score support
B 1.00 0.99 1.00 105
M 0.99 1.00 0.99 66
accuracy 0.99 171
macro avg 0.99 1.00 0.99 171
weighted avg 0.99 0.99 0.99 171
Support Vector Machines (SVMs)#
# Support vector machines are linear classifiers: they make linear decision boundaries
# The SVM's goal is to find the hyperplane (decision boundary) that best separates the n-dimensional data into classes
# If the data is not linearly separable in its original dimension (as with most real-world data), we use the kernel trick to add another feature (dimension), thereby making the points separable
# The radial basis function (RBF) kernel is one example; a toy illustration follows below
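Here is a toy sketch of the kernel idea: these one-dimensional points cannot be split by any single threshold, but adding x² as a second feature makes them linearly separable:

import numpy as np
x = np.array([-3, -2, -1, 0, 1, 2, 3]).reshape(-1, 1)
labels = np.array([1, 1, 0, 0, 0, 1, 1])  # outer points vs inner points: no single cut on x separates them
x_lifted = np.hstack([x, x ** 2])         # lift to 2-D by adding the squared feature
print(x_lifted)                           # in the lifted space the line x^2 = 2.5 cleanly separates the classes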
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#--- Wheat seed Dataset
dataset = pd.read_csv("/content/drive/MyDrive/Wheat_seed_dataset/wheat_seeds.csv")
print(dataset.info())
dataset.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Area 199 non-null float64
1 Perimeter 199 non-null float64
2 Compactness 199 non-null float64
3 Kernel.Length 199 non-null float64
4 Kernel.Width 199 non-null float64
5 Asymmetry.Coeff 199 non-null float64
6 Kernel.Groove 199 non-null float64
7 Type 199 non-null int64
dtypes: float64(7), int64(1)
memory usage: 12.6 KB
None
|  | Area | Perimeter | Compactness | Kernel.Length | Kernel.Width | Asymmetry.Coeff | Kernel.Groove | Type |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | 1 |
| 1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | 1 |
| 2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | 1 |
| 3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | 1 |
| 4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | 1 |
dataset["Type"].value_counts()
2 68
1 66
3 65
Name: Type, dtype: int64
dataset.isnull().sum()
Area 0
Perimeter 0
Compactness 0
Kernel.Length 0
Kernel.Width 0
Asymmetry.Coeff 0
Kernel.Groove 0
Type 0
dtype: int64
# Split data Train Test split
X = dataset.iloc[:, 0:-1]
print(X)
y= dataset.iloc[:, -1]
print(y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)
print(X_train)
print(X_test)
Area Perimeter Compactness Kernel.Length Kernel.Width \
0 15.26 14.84 0.8710 5.763 3.312
1 14.88 14.57 0.8811 5.554 3.333
2 14.29 14.09 0.9050 5.291 3.337
3 13.84 13.94 0.8955 5.324 3.379
4 16.14 14.99 0.9034 5.658 3.562
.. ... ... ... ... ...
194 12.19 13.20 0.8783 5.137 2.981
195 11.23 12.88 0.8511 5.140 2.795
196 13.20 13.66 0.8883 5.236 3.232
197 11.84 13.21 0.8521 5.175 2.836
198 12.30 13.34 0.8684 5.243 2.974
Asymmetry.Coeff Kernel.Groove
0 2.221 5.220
1 1.018 4.956
2 2.699 4.825
3 2.259 4.805
4 1.355 5.175
.. ... ...
194 3.631 4.870
195 4.325 5.003
196 8.315 5.056
197 3.598 5.044
198 5.637 5.063
[199 rows x 7 columns]
0 1
1 1
2 1
3 1
4 1
..
194 3
195 3
196 3
197 3
198 3
Name: Type, Length: 199, dtype: int64
Area Perimeter Compactness Kernel.Length Kernel.Width \
24 16.19 15.16 0.8849 5.833 3.421
43 13.80 14.04 0.8794 5.376 3.155
101 18.83 16.29 0.8917 6.037 3.786
142 12.70 13.71 0.8491 5.386 2.911
21 15.88 14.90 0.8988 5.618 3.507
.. ... ... ... ... ...
113 18.89 16.23 0.9008 6.227 3.769
64 14.01 14.29 0.8625 5.609 3.158
15 13.99 13.83 0.9183 5.119 3.383
125 18.30 15.89 0.9108 5.979 3.755
9 15.26 14.85 0.8696 5.714 3.242
Asymmetry.Coeff Kernel.Groove
24 0.9030 5.307
43 1.5600 4.961
101 2.5530 5.879
142 3.2600 5.316
21 0.7651 5.091
.. ... ...
113 3.6390 5.966
64 2.2170 5.132
15 5.2340 4.781
125 2.8370 5.962
9 4.5430 5.314
[149 rows x 7 columns]
Area Perimeter Compactness Kernel.Length Kernel.Width \
59 12.36 13.19 0.8923 5.076 3.042
5 14.38 14.21 0.8951 5.386 3.312
20 14.11 14.26 0.8722 5.520 3.168
124 17.55 15.66 0.8991 5.791 3.690
52 14.52 14.60 0.8557 5.741 3.113
19 14.16 14.40 0.8584 5.658 3.129
161 12.15 13.45 0.8443 5.417 2.837
55 14.92 14.43 0.9006 5.384 3.412
69 19.11 16.26 0.9081 6.154 3.930
2 14.29 14.09 0.9050 5.291 3.337
98 19.46 16.50 0.8985 6.113 3.892
10 14.03 14.16 0.8796 5.438 3.201
75 17.12 15.55 0.8892 5.850 3.566
134 13.07 13.92 0.8480 5.472 2.994
193 12.37 13.47 0.8567 5.204 2.960
63 14.34 14.37 0.8726 5.630 3.190
110 19.06 16.45 0.8854 6.416 3.719
78 20.20 16.89 0.8894 6.285 3.864
178 10.91 12.80 0.8372 5.088 2.675
114 20.03 16.90 0.8811 6.493 3.857
149 11.19 13.05 0.8253 5.250 2.675
129 15.56 14.89 0.8823 5.776 3.408
61 12.78 13.57 0.8716 5.262 3.026
87 18.76 16.20 0.8984 6.172 3.796
102 17.63 15.86 0.8800 6.033 3.573
120 18.75 16.18 0.8999 6.111 3.869
168 10.74 12.73 0.8329 5.145 2.642
1 14.88 14.57 0.8811 5.554 3.333
47 14.86 14.67 0.8676 5.678 3.258
171 11.41 12.95 0.8560 5.090 2.775
185 12.11 13.27 0.8639 5.236 2.975
39 13.50 13.85 0.8852 5.351 3.158
76 16.53 15.34 0.8823 5.875 3.467
91 16.87 15.65 0.8648 6.139 3.463
35 17.08 15.38 0.9079 5.832 3.683
121 18.65 16.41 0.8698 6.285 3.594
169 11.48 13.05 0.8473 5.180 2.758
162 11.35 13.12 0.8291 5.176 2.668
46 14.79 14.52 0.8819 5.545 3.291
173 12.19 13.36 0.8579 5.240 2.909
189 12.62 13.67 0.8481 5.410 2.911
7 16.63 15.46 0.8747 6.053 3.465
26 12.74 13.67 0.8564 5.395 2.956
148 11.36 13.05 0.8382 5.175 2.755
58 11.23 12.63 0.8840 4.902 2.879
72 17.32 15.91 0.8599 6.064 3.403
103 19.94 16.92 0.8752 6.675 3.763
198 12.30 13.34 0.8684 5.243 2.974
56 15.38 14.77 0.8857 5.662 3.419
184 10.82 12.83 0.8256 5.180 2.630
Asymmetry.Coeff Kernel.Groove
59 3.220 4.605
5 2.462 4.956
20 2.688 5.219
124 5.366 5.661
52 1.481 5.487
19 3.072 5.176
161 3.638 5.338
55 1.142 5.088
69 2.936 6.079
2 2.699 4.825
98 4.308 6.009
10 1.717 5.001
75 2.858 5.746
134 5.304 5.395
193 3.919 5.001
63 1.313 5.150
110 2.248 6.163
78 5.173 6.187
178 4.179 4.956
114 3.063 6.320
149 5.813 5.219
129 4.972 5.847
61 1.176 4.782
87 3.120 6.053
102 3.747 5.929
120 4.188 5.992
168 4.702 4.963
1 1.018 4.956
47 2.129 5.351
171 4.957 4.825
185 4.132 5.012
39 2.249 5.176
76 5.532 5.880
91 3.696 5.967
35 2.956 5.484
121 4.391 6.102
169 5.876 5.002
162 4.337 5.132
46 2.704 5.111
173 4.857 5.158
189 3.306 5.231
7 2.040 5.877
26 2.504 4.869
148 4.048 5.263
58 2.269 4.703
72 3.824 5.922
103 3.252 6.550
198 5.637 5.063
56 1.999 5.222
184 4.853 5.089
# Training an SVM classifier and tuning hyperparameters using GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
#--- define parameters set using dictionary
param = { "C" : [1.0, 10, 100, 1000] , "gamma" : [1, 0.1, 0.01, 0.001, 10], "kernel" : ["rbf"]}
grid = GridSearchCV(SVC(), param_grid= param, refit= True, verbose= 3)
grid.fit(X_train, y_train)
# best grid search param
best_param = grid.best_params_
print(best_param)
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
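GridSearchCV also records the mean cross-validated accuracy achieved by the winning parameters, which is worth printing alongside them:

print("Best cross-validated accuracy :", grid.best_score_)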
grid_prediction = grid.predict(X_test)
grid_prediction
# report
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, grid_prediction))
print("===============================")
print(classification_report(y_test, grid_prediction))
[[18 1 0]
[ 0 16 0]
[ 0 0 15]]
===============================
precision recall f1-score support
1 1.00 0.95 0.97 19
2 0.94 1.00 0.97 16
3 1.00 1.00 1.00 15
accuracy 0.98 50
macro avg 0.98 0.98 0.98 50
weighted avg 0.98 0.98 0.98 50
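As a final check, the confusion matrix above is easier to read as a heatmap; a small sketch using the seaborn and matplotlib imports already in this section (the tick labels assume wheat types 1, 2 and 3):

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, grid_prediction), annot=True, fmt="d",
            xticklabels=[1, 2, 3], yticklabels=[1, 2, 3], ax=ax)
ax.set_xlabel("Predicted type")
ax.set_ylabel("Actual type")
plt.show()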