AI Concepts#

In this section we are going to learn some AI concepts using both general and medical examples.

Principal Component Analysis#

# Principal component analysis (PCA) is all about preserving as much variance as we can, because variance is informative

# Variance measures how far a variable's values spread around the mean of the dataset; the variables are the features, so we reduce the number of features while preserving variance

# Multiplying the raw features X by the matrix of eigenvectors (each with its eigenvalue) gives the new features

# The new features are not a subset of the raw features; they are linear combinations of them
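
To make the variance idea concrete, here is a minimal sketch (not from the original notebook) computing the variance of a toy feature by hand and checking it against numpy:

import numpy as np

x = np.array([2.0, 4.0, 6.0, 8.0])       # one feature (variable)
mean = x.mean()                          # centre of the feature
var = ((x - mean) ** 2).mean()           # average squared distance from the mean
print(mean, var, np.var(x))              # 5.0 5.0 5.0 -- np.var agrees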

# Let's understand this using the iris dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

iris = sns.load_dataset("iris")
iris.head()
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
iris.describe()
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
X = iris.iloc[:, 0:4]
print(X)
Y = iris.iloc[:, 4]
print(Y)

# Standardization: rescale each feature to mean = 0 and sd = 1
iris_std = (X-X.mean())/ X.std()
print(iris_std)

# concatenate Y
iris_std = pd.concat([iris_std, Y], axis=1)
print(iris_std)
     sepal_length  sepal_width  petal_length  petal_width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
146           6.3          2.5           5.0          1.9
147           6.5          3.0           5.2          2.0
148           6.2          3.4           5.4          2.3
149           5.9          3.0           5.1          1.8

[150 rows x 4 columns]
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object
     sepal_length  sepal_width  petal_length  petal_width
0       -0.897674     1.015602     -1.335752    -1.311052
1       -1.139200    -0.131539     -1.335752    -1.311052
2       -1.380727     0.327318     -1.392399    -1.311052
3       -1.501490     0.097889     -1.279104    -1.311052
4       -1.018437     1.245030     -1.335752    -1.311052
..            ...          ...           ...          ...
145      1.034539    -0.131539      0.816859     1.443994
146      0.551486    -1.278680      0.703564     0.919223
147      0.793012    -0.131539      0.816859     1.050416
148      0.430722     0.786174      0.930154     1.443994
149      0.068433    -0.131539      0.760211     0.788031

[150 rows x 4 columns]
     sepal_length  sepal_width  petal_length  petal_width    species
0       -0.897674     1.015602     -1.335752    -1.311052     setosa
1       -1.139200    -0.131539     -1.335752    -1.311052     setosa
2       -1.380727     0.327318     -1.392399    -1.311052     setosa
3       -1.501490     0.097889     -1.279104    -1.311052     setosa
4       -1.018437     1.245030     -1.335752    -1.311052     setosa
..            ...          ...           ...          ...        ...
145      1.034539    -0.131539      0.816859     1.443994  virginica
146      0.551486    -1.278680      0.703564     0.919223  virginica
147      0.793012    -0.131539      0.816859     1.050416  virginica
148      0.430722     0.786174      0.930154     1.443994  virginica
149      0.068433    -0.131539      0.760211     0.788031  virginica

[150 rows x 5 columns]
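A hedged aside: pandas' .std() uses the sample standard deviation (ddof=1), while sklearn's StandardScaler divides by the population standard deviation (ddof=0), so the two results differ by a constant factor of sqrt(149/150) on this dataset. A sketch of the comparison, assuming X from above:

from sklearn.preprocessing import StandardScaler

manual = (X - X.mean()) / X.std(ddof=0)      # population sd, matches StandardScaler
scaled = StandardScaler().fit_transform(X)   # returns a numpy array
print(np.allclose(manual.values, scaled))    # True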
print(iris.describe())
print(iris_std.describe())
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
       sepal_length   sepal_width  petal_length   petal_width
count  1.500000e+02  1.500000e+02  1.500000e+02  1.500000e+02
mean  -5.684342e-16 -7.815970e-16 -2.842171e-16 -3.789561e-16
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00
min   -1.863780e+00 -2.425820e+00 -1.562342e+00 -1.442245e+00
25%   -8.976739e-01 -5.903951e-01 -1.222456e+00 -1.179859e+00
50%   -5.233076e-02 -1.315388e-01  3.353541e-01  1.320673e-01
75%    6.722490e-01  5.567457e-01  7.602115e-01  7.880307e-01
max    2.483699e+00  3.080455e+00  1.779869e+00  1.706379e+00
# Let's check by plotting whether standardization preserved the information (variance)

print("Plots showed distribution and scatter preserved , No change , where as data standardized at center zero" )
print("---------------------------------------------------------------------------------------------------------")
fig, axes = plt.subplots(nrows= 2, ncols= 2, figsize = (20, 10))
print(axes)

# distplot is deprecated in recent seaborn; histplot(kde=True) is the replacement
axes_0 = sns.histplot(iris['petal_length'], kde=True, ax=axes[0, 0])
axes_0.set_title("raw iris petal length distribution plot")

axes_1 = sns.histplot(iris_std['petal_length'], kde=True, ax=axes[0, 1])
axes_1.set_title("standardized iris petal length distribution plot")

axes_2 = sns.scatterplot(x="petal_length", y="petal_width", data=iris, hue='species', ax=axes[1, 0])
axes_2.set_title("raw iris scatter plot")

axes_3 = sns.scatterplot(x="petal_length", y="petal_width", data=iris_std, hue='species', ax=axes[1, 1])
axes_3.set_title("standardized iris scatter plot")

Plots show the distribution and scatter shapes are preserved (no change), while the standardized data is centered at zero
---------------------------------------------------------------------------------------------------------
(figure: raw vs standardized petal-length distributions, and petal length vs width scatter plots before and after standardization)
# Now the data is standardized; next we can
# ---- prepare the correlation matrix using numpy
# ---- find the eigenvalues and eigenvectors with np.linalg.eig (see the sketch below)
# ---- or simply let sklearn's PCA handle both steps

X_std =  iris_std.iloc[:, 0:4]
print(X_std)

y = iris_std.iloc[:,4]
print(y)
     sepal_length  sepal_width  petal_length  petal_width
0       -0.897674     1.015602     -1.335752    -1.311052
1       -1.139200    -0.131539     -1.335752    -1.311052
2       -1.380727     0.327318     -1.392399    -1.311052
3       -1.501490     0.097889     -1.279104    -1.311052
4       -1.018437     1.245030     -1.335752    -1.311052
..            ...          ...           ...          ...
145      1.034539    -0.131539      0.816859     1.443994
146      0.551486    -1.278680      0.703564     0.919223
147      0.793012    -0.131539      0.816859     1.050416
148      0.430722     0.786174      0.930154     1.443994
149      0.068433    -0.131539      0.760211     0.788031

[150 rows x 4 columns]
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object
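The comments above mention building a correlation matrix and using np.linalg.eig; since the notebook hands this step to sklearn, here is a minimal sketch of the manual route (assuming X_std from above) for comparison:

# correlation matrix of the standardized features (here equal to their covariance matrix)
corr = np.corrcoef(X_std.T)                  # shape (4, 4)
eig_vals, eig_vecs = np.linalg.eig(corr)     # eigenvalues and eigenvectors
order = np.argsort(eig_vals)[::-1]           # sort components by variance explained
W = eig_vecs[:, order[:2]]                   # top-2 eigenvectors as projection matrix
manual_PCs = X_std.values @ W                # new features = X multiplied by eigenvectors
print(eig_vals[order])                       # variance carried by each component

Up to a sign flip per component, manual_PCs should match the sklearn output below.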
from sklearn.decomposition import PCA

sklearn_pca = PCA(n_components= 2, random_state= 100)
PCs = sklearn_pca.fit_transform(X_std)
print(PCs.shape)
print(X_std.shape)
print(PCs)
(150, 2)
(150, 4)
[[-2.25714118  0.47842383]
 [-2.07401302 -0.67188269]
 [-2.35633511 -0.34076642]
 [-2.29170679 -0.59539986]
 [-2.3818627   0.64467566]
 [-2.06870061  1.4842053 ]
 [-2.43586845  0.04748512]
 [-2.22539189  0.222403  ]
 [-2.32684533 -1.1116037 ]
 [-2.17703491 -0.46744757]
 [-2.15907699  1.04020587]
 [-2.31836413  0.132634  ]
 [-2.2110437  -0.72624318]
 [-2.62430902 -0.95829635]
 [-2.19139921  1.85384655]
 [-2.25466121  2.67731523]
 [-2.20021676  1.47865573]
 [-2.18303613  0.48720613]
 [-1.89223284  1.40032757]
 [-2.33554476  1.1240836 ]
 [-1.90793125  0.40749058]
 [-2.19964383  0.92103587]
 [-2.76508142  0.4568133 ]
 [-1.81259716  0.08527285]
 [-2.21972701  0.13679618]
 [-1.9453293  -0.62352971]
 [-2.04430277  0.24135499]
 [-2.1613365   0.52538942]
 [-2.13241965  0.312172  ]
 [-2.25769799 -0.33660425]
 [-2.13297647 -0.50285608]
 [-1.82547925  0.42228039]
 [-2.60621687  1.78758727]
 [-2.43800983  2.1435468 ]
 [-2.10292986 -0.45866527]
 [-2.20043723 -0.20541922]
 [-2.03831765  0.65934923]
 [-2.51889339  0.59031516]
 [-2.42152026 -0.90116107]
 [-2.16246625  0.2679812 ]
 [-2.27884081  0.44024054]
 [-1.85191836 -2.32961074]
 [-2.54511203 -0.47750102]
 [-1.95788857  0.47074961]
 [-2.12992356  1.13841546]
 [-2.06283361 -0.70867859]
 [-2.37677076  1.11668869]
 [-2.38638171 -0.38495723]
 [-2.22200263  0.99462767]
 [-2.19647504  0.00918558]
 [ 1.09810244  0.86009103]
 [ 0.72889556  0.59262936]
 [ 1.2368358   0.61423989]
 [ 0.40612251 -1.7485462 ]
 [ 1.07188379 -0.20772515]
 [ 0.38738955 -0.59130272]
 [ 0.74403715  0.77043827]
 [-0.48569562 -1.846244  ]
 [ 0.92480346  0.03211848]
 [ 0.01138804 -1.03056578]
 [-0.10982834 -2.64521111]
 [ 0.43922201 -0.06308385]
 [ 0.56023148 -1.75883213]
 [ 0.71715934 -0.18560282]
 [-0.03324333 -0.43753742]
 [ 0.87248429  0.50736424]
 [ 0.34908221 -0.19565627]
 [ 0.1582798  -0.78945101]
 [ 1.22100316 -1.61682728]
 [ 0.16436725 -1.29825994]
 [ 0.73521959  0.39524745]
 [ 0.47469691 -0.41592689]
 [ 1.23005729 -0.93020944]
 [ 0.63074514 -0.41499744]
 [ 0.70031506 -0.06320009]
 [ 0.87135454  0.24995602]
 [ 1.25231375 -0.07699807]
 [ 1.35386953  0.33020546]
 [ 0.66258066 -0.2251735 ]
 [-0.04012419 -1.05518358]
 [ 0.13035846 -1.55705555]
 [ 0.02337438 -1.56722524]
 [ 0.2407318  -0.7746612 ]
 [ 1.05755171 -0.6317269 ]
 [ 0.22323093 -0.28681266]
 [ 0.42770626  0.84275892]
 [ 1.04522645  0.52030871]
 [ 1.04104379 -1.37837105]
 [ 0.06935597 -0.21877043]
 [ 0.28253073 -1.32488615]
 [ 0.27814596 -1.11628885]
 [ 0.62248441  0.02483981]
 [ 0.33540673 -0.98510383]
 [-0.36097409 -2.01249582]
 [ 0.28762268 -0.85287312]
 [ 0.09105561 -0.18058714]
 [ 0.22695654 -0.38363487]
 [ 0.57446378 -0.15435649]
 [-0.4461723  -1.53863746]
 [ 0.25587339 -0.59685229]
 [ 1.83841002  0.86751506]
 [ 1.15401555 -0.6965364 ]
 [ 2.19790361  0.56013398]
 [ 1.43534213 -0.0468307 ]
 [ 1.86157577  0.2940597 ]
 [ 2.74268509  0.79773671]
 [ 0.36579225 -1.55628918]
 [ 2.29475181  0.41866302]
 [ 1.99998633 -0.70906323]
 [ 2.25223216  1.9145963 ]
 [ 1.35962064  0.6904434 ]
 [ 1.59732747 -0.42029243]
 [ 1.87761053  0.41784981]
 [ 1.25590769 -1.15837974]
 [ 1.46274487 -0.44079488]
 [ 1.5847682   0.67398689]
 [ 1.46651849  0.25476833]
 [ 2.4182277   2.5481248 ]
 [ 3.29964148  0.01772158]
 [ 1.25954707 -1.70104672]
 [ 2.03091256  0.90742744]
 [ 0.97471535 -0.56985526]
 [ 2.8879765   0.41225995]
 [ 1.32878064 -0.4802025 ]
 [ 1.6950553   1.01053648]
 [ 1.94780139  1.00441272]
 [ 1.17118007 -0.31533806]
 [ 1.01754169  0.06413118]
 [ 1.78237879 -0.18673563]
 [ 1.85742501  0.56041329]
 [ 2.4278203   0.25841871]
 [ 2.29723178  2.61755442]
 [ 1.85648383 -0.17795333]
 [ 1.1104277  -0.29194458]
 [ 1.19845835 -0.80860636]
 [ 2.78942561  0.85394254]
 [ 1.57099294  1.06501321]
 [ 1.34179696  0.42102015]
 [ 0.92173701  0.01716559]
 [ 1.84586124  0.67387065]
 [ 2.00808316  0.61183593]
 [ 1.89543421  0.68727307]
 [ 1.15401555 -0.6965364 ]
 [ 2.03374499  0.86462403]
 [ 1.99147547  1.04566567]
 [ 1.86425786  0.38567404]
 [ 1.55935649 -0.89369285]
 [ 1.51609145  0.26817075]
 [ 1.36820418  1.00787793]
 [ 0.95744849 -0.02425043]]
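Since PCA is all about preserving variance, it is worth checking how much of it the two components kept; a quick addition not in the original run:

print(sklearn_pca.explained_variance_ratio_)        # fraction of variance per component
print(sklearn_pca.explained_variance_ratio_.sum())  # roughly 0.96 for iris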
iris_transform_df = pd.DataFrame(PCs, columns=["PC1", "PC2"])
iris_transform_df = pd.concat([iris_transform_df, y], axis=1)

iris_transform_df.head()


fig, ax = plt.subplots(figsize=(10, 8))

# pass x and y as keywords; positional use is deprecated in recent seaborn
ax_0 = sns.scatterplot(x="PC1", y="PC2", data=iris_transform_df, hue="species")
plt.show()
(figure: scatter plot of PC1 vs PC2, colored by species)
Linear Regression#

# The art of enabling machines to form rules and find trends from data without being explicitly programmed is known as machine learning

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
dataset = pd.read_csv("/content/drive/MyDrive/Diabetes/diabetes.txt", delimiter= "\t")
dataset.head()
   AGE  SEX   BMI     BP   S1     S2    S3   S4      S5  S6    Y
0   59    2  32.1  101.0  157   93.2  38.0  4.0  4.8598  87  151
1   48    1  21.6   87.0  183  103.2  70.0  3.0  3.8918  69   75
2   72    2  30.5   93.0  156   93.6  41.0  4.0  4.6728  85  141
3   24    1  25.3   84.0  198  131.4  40.0  5.0  4.8903  89  206
4   50    1  23.0  101.0  192  125.4  52.0  4.0  4.2905  80  135
print(dataset.info())

print("----------------------------------------------------------------------")

print(dataset.isna().sum())
print("---------------------------------------------------------------------")

print(dataset.describe().T)
# Separate the dataset into independent and dependent features

X = dataset.iloc[:, :-1]
print(X)
print("-------------------------------------------------------------------------")
y= dataset.iloc[:, -1]
print(y)
     AGE  SEX   BMI      BP   S1     S2    S3    S4      S5   S6
0     59    2  32.1  101.00  157   93.2  38.0  4.00  4.8598   87
1     48    1  21.6   87.00  183  103.2  70.0  3.00  3.8918   69
2     72    2  30.5   93.00  156   93.6  41.0  4.00  4.6728   85
3     24    1  25.3   84.00  198  131.4  40.0  5.00  4.8903   89
4     50    1  23.0  101.00  192  125.4  52.0  4.00  4.2905   80
..   ...  ...   ...     ...  ...    ...   ...   ...     ...  ...
437   60    2  28.2  112.00  185  113.8  42.0  4.00  4.9836   93
438   47    2  24.9   75.00  225  166.0  42.0  5.00  4.4427  102
439   60    2  24.9   99.67  162  106.6  43.0  3.77  4.1271   95
440   36    1  30.0   95.00  201  125.2  42.0  4.79  5.1299   85
441   36    1  19.6   71.00  250  133.2  97.0  3.00  4.5951   92

[442 rows x 10 columns]
-------------------------------------------------------------------------
0      151
1       75
2      141
3      206
4      135
      ... 
437    178
438    104
439    132
440    220
441     57
Name: Y, Length: 442, dtype: int64
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 110)
print(X_train)
print(X_test)
print("-----------------------------------------------------")
print(y_train)
print(y_test)
     AGE  SEX   BMI      BP   S1     S2    S3    S4      S5   S6
96    64    2  27.3  109.00  186  107.6  38.0  5.00  5.3083   99
333   56    2  25.8  103.00  177  114.4  34.0  5.00  4.9628   99
317   54    1  23.2  110.67  238  162.8  48.0  4.96  4.9127  108
239   55    1  28.2   91.00  250  140.2  67.0  4.00  5.3660  103
113   54    2  27.7  113.00  200  128.4  37.0  5.00  5.1533  113
..   ...  ...   ...     ...  ...    ...   ...   ...     ...  ...
381   29    2  18.1   73.00  158   99.0  41.0  4.00  4.4998   78
271   59    2  27.2  107.00  158  102.0  39.0  4.00  4.4427   93
61    37    2  26.8   79.00  157   98.0  28.0  6.00  5.0434   96
227   67    2  23.6  111.33  189  105.4  70.0  2.70  4.2195   93
128   34    1  20.6   87.00  185  112.2  58.0  3.00  4.3041   74

[353 rows x 10 columns]
     AGE  SEX   BMI      BP   S1     S2    S3    S4      S5   S6
294   55    2  23.5   93.00  177  126.8  41.0  4.00  3.8286   83
159   47    1  30.4  120.00  199  120.0  46.0  4.00  5.1059   87
312   28    1  24.2   93.00  174  106.4  54.0  3.00  4.2195   84
119   53    1  22.0   94.00  175   88.0  59.0  3.00  4.9416   98
134   28    1  30.4   85.00  198  115.6  67.0  3.00  4.3438   80
..   ...  ...   ...     ...  ...    ...   ...   ...     ...  ...
414   71    2  27.0   93.33  269  190.2  41.0  6.56  5.2417   93
166   33    2  20.8   84.00  125   70.2  46.0  3.00  3.7842   66
434   53    1  26.5   97.00  193  122.4  58.0  3.00  4.1431   99
390   51    2  32.8  112.00  202  100.6  37.0  5.00  5.7746  109
168   49    2  31.9   94.00  234  155.8  34.0  7.00  5.3982  122

[89 rows x 10 columns]
-----------------------------------------------------
96     150
333    164
317    190
239    262
113    297
      ... 
381    104
271    127
61     144
227    108
128    115
Name: Y, Length: 353, dtype: int64
294     55
159    195
312    144
119    200
134    103
      ... 
414    131
166     70
434     49
390    277
168    268
Name: Y, Length: 89, dtype: int64
from sklearn.linear_model import LinearRegression
L_regressor = LinearRegression()
L_regressor.fit(X_train, y_train)

#---- test model on test data X_test
prediction = L_regressor.predict(X_test)
print(prediction)
print(y_test)

plt.scatter(y_test, prediction)
plt.xlabel("Actual values")
plt.ylabel("Predicted values")
plt.title("Actual values vs predicted values")

[ 88.7257154  236.01349565 122.82671275 152.91194894 133.26299572
 155.10995617 108.45070799 166.16370245 206.40303033 227.12512886
  45.58436461 257.33397866 188.25079196 143.91214004 101.58303786
 129.18862987 157.18468301 140.00779148 142.28184775 115.48953396
 129.15211612 182.9636348   52.25034904 286.86546595 242.7761079
  46.07643676 146.37057691 125.90228716 137.64669731 102.12906943
 159.294184   127.19797503 120.2095064  189.73221207 121.94777703
  87.58871142 162.09127654 214.57935651 100.0105965  265.88447661
 237.09032754 102.23950004 115.64512814 193.83264705 185.47519786
 186.85928531 281.30774621  86.98229992 220.56270381 141.45606787
 101.56547169 173.03077936  96.25903702 159.50774879  53.63613547
 208.84468478 165.93698745 124.42552414 160.59146202 248.64622776
 103.47508113 148.42042807 205.43430394  93.90192239 161.46649216
  91.00134606 253.13344701  96.16425468 164.29821281 181.27263938
  58.24275384 116.28152401 183.49639474 169.54560591 110.11920824
 236.24602514  59.7852014  155.27187615 287.9524243  213.84063469
  82.59836521 259.69744698 178.24353173 107.16945725 167.34950979
  58.7571307  135.75101228 250.65478989 223.09415657]
294     55
159    195
312    144
119    200
134    103
      ... 
414    131
166     70
434     49
390    277
168    268
Name: Y, Length: 89, dtype: int64
Text(0.5, 1.0, 'Actual values vs predicted values')
(figure: scatter plot of actual vs predicted values)
from sklearn import metrics

print("Mean absolute error : ", metrics.mean_absolute_percentage_error(y_test, prediction))

print("Mean squared error :", metrics.mean_squared_error(y_test, prediction))
Mean absolute percentage error :  0.3300922548696817
Mean squared error : 2228.090831059464
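As a hedged extension of the evaluation above, two further standard regression metrics from the same sklearn.metrics module:

print("Root mean squared error :", np.sqrt(metrics.mean_squared_error(y_test, prediction)))
print("R squared :", metrics.r2_score(y_test, prediction))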
# Intercept and coefficient parameters

print(L_regressor.intercept_)

print("------------------------")
print(L_regressor.coef_)

# Map coefficients to features

coef_df = pd.DataFrame(L_regressor.coef_, X.columns, columns=["Coefficient"])
coef_df
-307.57104546505354
------------------------
[ 1.40608997e-01 -2.56128390e+01  5.08771783e+00  1.19293496e+00
 -1.08819176e+00  8.03227722e-01  4.24108443e-02  4.53744471e+00
  6.51304992e+01  3.71060693e-01]
     Coefficient
AGE     0.140609
SEX   -25.612839
BMI     5.087718
BP      1.192935
S1     -1.088192
S2      0.803228
S3      0.042411
S4      4.537445
S5     65.130499
S6      0.371061

Logistic Regression#

import pandas as pd
dataset = pd.read_csv("/content/drive/MyDrive/Breast Cancer Data/Breast_cancer_data.csv")
print(dataset.head())
print("------------------------------------------------------------------------------")
print(dataset.columns)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \
0  ...          17.33           184.60      2019.0            0.1622   
1  ...          23.41           158.80      1956.0            0.1238   
2  ...          25.53           152.50      1709.0            0.1444   
3  ...          26.50            98.87       567.7            0.2098   
4  ...          16.67           152.20      1575.0            0.1374   

   compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \
0             0.6656           0.7119                0.2654          0.4601   
1             0.1866           0.2416                0.1860          0.2750   
2             0.4245           0.4504                0.2430          0.3613   
3             0.8663           0.6869                0.2575          0.6638   
4             0.2050           0.4000                0.1625          0.2364   

   fractal_dimension_worst  Unnamed: 32  
0                  0.11890          NaN  
1                  0.08902          NaN  
2                  0.08758          NaN  
3                  0.17300          NaN  
4                  0.07678          NaN  

[5 rows x 33 columns]
------------------------------------------------------------------------------
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
dataset.T.head()
                     0       1         2         3         4       5       6         7       8         9  ...     559     560     561     562     563     564     565     566     567    568
id              842302  842517  84300903  84348301  84358402  843786  844359  84458202  844981  84501001  ...  925291  925292  925311  925622  926125  926424  926682  926954  927241  92751
diagnosis            M       M         M         M         M       M       M         M       M         M  ...       B       B       B       M       M       M       M       M       M      B
radius_mean      17.99   20.57     19.69     11.42     20.29   12.45   18.25     13.71    13.0     12.46  ...   11.51   14.05    11.2   15.22   20.92   21.56   20.13    16.6    20.6   7.76
texture_mean     10.38   17.77     21.25     20.38     14.34    15.7   19.98     20.83   21.82     24.04  ...   23.93   27.15   29.37   30.62   25.09   22.39   28.25   28.08   29.33  24.54
perimeter_mean   122.8   132.9     130.0     77.58     135.1   82.57   119.6      90.2    87.5     83.97  ...   74.52   91.38   70.67   103.4   143.0   142.0   131.2   108.3   140.1  47.92

5 rows × 569 columns

# The columns id and Unnamed: 32 are not required, so we drop them
# (errors="ignore" keeps this cell safe to re-run once the columns are already gone)
dataset = dataset.drop(["id", "Unnamed: 32"], axis=1, errors="ignore")
print(dataset.head())

print(dataset.describe().T)
print("----------------------------------------------------------------------------------------------")
print(dataset.info())
                         count        mean         std         min  \
radius_mean              569.0   14.127292    3.524049    6.981000   
texture_mean             569.0   19.289649    4.301036    9.710000   
perimeter_mean           569.0   91.969033   24.298981   43.790000   
area_mean                569.0  654.889104  351.914129  143.500000   
smoothness_mean          569.0    0.096360    0.014064    0.052630   
compactness_mean         569.0    0.104341    0.052813    0.019380   
concavity_mean           569.0    0.088799    0.079720    0.000000   
concave points_mean      569.0    0.048919    0.038803    0.000000   
symmetry_mean            569.0    0.181162    0.027414    0.106000   
fractal_dimension_mean   569.0    0.062798    0.007060    0.049960   
radius_se                569.0    0.405172    0.277313    0.111500   
texture_se               569.0    1.216853    0.551648    0.360200   
perimeter_se             569.0    2.866059    2.021855    0.757000   
area_se                  569.0   40.337079   45.491006    6.802000   
smoothness_se            569.0    0.007041    0.003003    0.001713   
compactness_se           569.0    0.025478    0.017908    0.002252   
concavity_se             569.0    0.031894    0.030186    0.000000   
concave points_se        569.0    0.011796    0.006170    0.000000   
symmetry_se              569.0    0.020542    0.008266    0.007882   
fractal_dimension_se     569.0    0.003795    0.002646    0.000895   
radius_worst             569.0   16.269190    4.833242    7.930000   
texture_worst            569.0   25.677223    6.146258   12.020000   
perimeter_worst          569.0  107.261213   33.602542   50.410000   
area_worst               569.0  880.583128  569.356993  185.200000   
smoothness_worst         569.0    0.132369    0.022832    0.071170   
compactness_worst        569.0    0.254265    0.157336    0.027290   
concavity_worst          569.0    0.272188    0.208624    0.000000   
concave points_worst     569.0    0.114606    0.065732    0.000000   
symmetry_worst           569.0    0.290076    0.061867    0.156500   
fractal_dimension_worst  569.0    0.083946    0.018061    0.055040   

                                25%         50%          75%         max  
radius_mean               11.700000   13.370000    15.780000    28.11000  
texture_mean              16.170000   18.840000    21.800000    39.28000  
perimeter_mean            75.170000   86.240000   104.100000   188.50000  
area_mean                420.300000  551.100000   782.700000  2501.00000  
smoothness_mean            0.086370    0.095870     0.105300     0.16340  
compactness_mean           0.064920    0.092630     0.130400     0.34540  
concavity_mean             0.029560    0.061540     0.130700     0.42680  
concave points_mean        0.020310    0.033500     0.074000     0.20120  
symmetry_mean              0.161900    0.179200     0.195700     0.30400  
fractal_dimension_mean     0.057700    0.061540     0.066120     0.09744  
radius_se                  0.232400    0.324200     0.478900     2.87300  
texture_se                 0.833900    1.108000     1.474000     4.88500  
perimeter_se               1.606000    2.287000     3.357000    21.98000  
area_se                   17.850000   24.530000    45.190000   542.20000  
smoothness_se              0.005169    0.006380     0.008146     0.03113  
compactness_se             0.013080    0.020450     0.032450     0.13540  
concavity_se               0.015090    0.025890     0.042050     0.39600  
concave points_se          0.007638    0.010930     0.014710     0.05279  
symmetry_se                0.015160    0.018730     0.023480     0.07895  
fractal_dimension_se       0.002248    0.003187     0.004558     0.02984  
radius_worst              13.010000   14.970000    18.790000    36.04000  
texture_worst             21.080000   25.410000    29.720000    49.54000  
perimeter_worst           84.110000   97.660000   125.400000   251.20000  
area_worst               515.300000  686.500000  1084.000000  4254.00000  
smoothness_worst           0.116600    0.131300     0.146000     0.22260  
compactness_worst          0.147200    0.211900     0.339100     1.05800  
concavity_worst            0.114500    0.226700     0.382900     1.25200  
concave points_worst       0.064930    0.099930     0.161400     0.29100  
symmetry_worst             0.250400    0.282200     0.317900     0.66380  
fractal_dimension_worst    0.071460    0.080040     0.092080     0.20750  
----------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  569 non-null    float64
 15  smoothness_se            569 non-null    float64
 16  compactness_se           569 non-null    float64
 17  concavity_se             569 non-null    float64
 18  concave points_se        569 non-null    float64
 19  symmetry_se              569 non-null    float64
 20  fractal_dimension_se     569 non-null    float64
 21  radius_worst             569 non-null    float64
 22  texture_worst            569 non-null    float64
 23  perimeter_worst          569 non-null    float64
 24  area_worst               569 non-null    float64
 25  smoothness_worst         569 non-null    float64
 26  compactness_worst        569 non-null    float64
 27  concavity_worst          569 non-null    float64
 28  concave points_worst     569 non-null    float64
 29  symmetry_worst           569 non-null    float64
 30  fractal_dimension_worst  569 non-null    float64
dtypes: float64(30), object(1)
memory usage: 137.9+ KB
None
print(dataset.diagnosis.value_counts())
print(dataset.iloc[:,0])
print(dataset.head(2))
B    357
M    212
Name: diagnosis, dtype: int64
0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: diagnosis, Length: 569, dtype: object
  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M        17.99         10.38           122.8     1001.0   
1         M        20.57         17.77           132.9     1326.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419  ...         25.38          17.33            184.6   
1         0.1812  ...         24.99          23.41            158.8   

   area_worst  smoothness_worst  compactness_worst  concavity_worst  \
0      2019.0            0.1622             0.6656           0.7119   
1      1956.0            0.1238             0.1866           0.2416   

   concave points_worst  symmetry_worst  fractal_dimension_worst  
0                0.2654          0.4601                  0.11890  
1                0.1860          0.2750                  0.08902  

[2 rows x 31 columns]
X = dataset.iloc[:, 1:]
print(X)
y= dataset.iloc[:, 0]
print(y)
     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0             0.27760         0.30010              0.14710         0.2419   
1             0.07864         0.08690              0.07017         0.1812   
2             0.15990         0.19740              0.12790         0.2069   
3             0.28390         0.24140              0.10520         0.2597   
4             0.13280         0.19800              0.10430         0.1809   
..                ...             ...                  ...            ...   
564           0.11590         0.24390              0.13890         0.1726   
565           0.10340         0.14400              0.09791         0.1752   
566           0.10230         0.09251              0.05302         0.1590   
567           0.27700         0.35140              0.15200         0.2397   
568           0.04362         0.00000              0.00000         0.1587   

     fractal_dimension_mean  ...  radius_worst  texture_worst  \
0                   0.07871  ...        25.380          17.33   
1                   0.05667  ...        24.990          23.41   
2                   0.05999  ...        23.570          25.53   
3                   0.09744  ...        14.910          26.50   
4                   0.05883  ...        22.540          16.67   
..                      ...  ...           ...            ...   
564                 0.05623  ...        25.450          26.40   
565                 0.05533  ...        23.690          38.25   
566                 0.05648  ...        18.980          34.12   
567                 0.07016  ...        25.740          39.42   
568                 0.05884  ...         9.456          30.37   

     perimeter_worst  area_worst  smoothness_worst  compactness_worst  \
0             184.60      2019.0           0.16220            0.66560   
1             158.80      1956.0           0.12380            0.18660   
2             152.50      1709.0           0.14440            0.42450   
3              98.87       567.7           0.20980            0.86630   
4             152.20      1575.0           0.13740            0.20500   
..               ...         ...               ...                ...   
564           166.10      2027.0           0.14100            0.21130   
565           155.00      1731.0           0.11660            0.19220   
566           126.70      1124.0           0.11390            0.30940   
567           184.60      1821.0           0.16500            0.86810   
568            59.16       268.6           0.08996            0.06444   

     concavity_worst  concave points_worst  symmetry_worst  \
0             0.7119                0.2654          0.4601   
1             0.2416                0.1860          0.2750   
2             0.4504                0.2430          0.3613   
3             0.6869                0.2575          0.6638   
4             0.4000                0.1625          0.2364   
..               ...                   ...             ...   
564           0.4107                0.2216          0.2060   
565           0.3215                0.1628          0.2572   
566           0.3403                0.1418          0.2218   
567           0.9387                0.2650          0.4087   
568           0.0000                0.0000          0.2871   

     fractal_dimension_worst  
0                    0.11890  
1                    0.08902  
2                    0.08758  
3                    0.17300  
4                    0.07678  
..                       ...  
564                  0.07115  
565                  0.06637  
566                  0.07820  
567                  0.12400  
568                  0.07039  

[569 rows x 30 columns]
0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: diagnosis, Length: 569, dtype: object
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_norm = sc.fit_transform(X)   # standardize: mean 0, sd 1 per feature
print(X_norm)
[[ 1.09706398 -2.07333501  1.26993369 ...  2.29607613  2.75062224
   1.93701461]
 [ 1.82982061 -0.35363241  1.68595471 ...  1.0870843  -0.24388967
   0.28118999]
 [ 1.57988811  0.45618695  1.56650313 ...  1.95500035  1.152255
   0.20139121]
 ...
 [ 0.70228425  2.0455738   0.67267578 ...  0.41406869 -1.10454895
  -0.31840916]
 [ 1.83834103  2.33645719  1.98252415 ...  2.28998549  1.91908301
   2.21963528]
 [-1.80840125  1.22179204 -1.81438851 ... -1.74506282 -0.04813821
  -0.75120669]]
# split the standardized data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_norm,y, test_size= 0.2, random_state = 101)
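
A hedged note: fitting the scaler on all of X before splitting lets the test rows influence the scaler's mean and standard deviation (data leakage); the stricter pattern, sketched below, fits the scaler on the training split only:

# leakage-free alternative (sketch)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=101)
sc2 = StandardScaler().fit(X_tr)                           # statistics from training data only
X_tr_s, X_te_s = sc2.transform(X_tr), sc2.transform(X_te)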
# initialize the classifier
from sklearn.linear_model import LogisticRegression
Log_classifier =  LogisticRegression(random_state=0)
Log_classifier.fit(X_train, y_train)

#----
y_pred = Log_classifier.predict(X_test)
print(y_pred)

# compare y_pred vs actual y_test

print("-----------------------------------------------------------------")
print(y_test)

['B' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'B' 'B'
 'M' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'M' 'B' 'M' 'B' 'M' 'B' 'B'
 'M' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'B' 'B' 'M'
 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'M' 'B'
 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'M'
 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'M' 'B' 'B' 'B' 'B' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'M']
-----------------------------------------------------------------
107    B
437    B
195    B
141    M
319    B
      ..
19     B
313    B
139    B
495    B
317    M
Name: diagnosis, Length: 114, dtype: object
# Model evaluation

from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix( y_test, y_pred)
print(cm)
print("---------------------------------------------------------------------------")
print("classification report \n", classification_report(y_test, y_pred))
[[72  0]
 [ 1 41]]
---------------------------------------------------------------------------
classification report 
               precision    recall  f1-score   support

           B       0.99      1.00      0.99        72
           M       1.00      0.98      0.99        42

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114
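Logistic regression also yields class probabilities, so we can compute a ROC AUC as well; a short sketch not in the original run:

from sklearn.metrics import roc_auc_score

# column 1 of predict_proba is P(class == 'M'), since classes_ is ['B', 'M']
y_prob = Log_classifier.predict_proba(X_test)[:, 1]
print(roc_auc_score((y_test == 'M').astype(int), y_prob))   # 'M' treated as the positive class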
# --- Getting the most informative features from the coefficients

coeff_df = pd.DataFrame(Log_classifier.coef_.T, X.columns, columns=['Coefficient'])
coeff_df = coeff_df.sort_values(by=['Coefficient'], ascending=False)
coeff_df

# the features "radius_se" and "texture_worst" have the largest coefficients, i.e. a major contribution to determining malignancy
                         Coefficient
radius_se                   1.379673
texture_worst               1.169080
concave points_mean         1.156939
area_se                     0.984579
concave points_worst        0.933089
radius_worst                0.903471
smoothness_worst            0.872462
area_worst                  0.846826
concavity_mean              0.807876
concavity_worst             0.802358
perimeter_se                0.637758
fractal_dimension_worst     0.618413
perimeter_worst             0.568124
symmetry_worst              0.562905
texture_mean                0.387553
area_mean                   0.314432
concave points_se           0.267092
radius_mean                 0.266278
perimeter_mean              0.232904
compactness_worst           0.052849
symmetry_mean               0.042495
smoothness_mean            -0.036783
symmetry_se                -0.042753
smoothness_se              -0.135624
texture_se                 -0.153152
concavity_se               -0.224247
fractal_dimension_mean     -0.378132
compactness_mean           -0.590002
fractal_dimension_se       -0.697663
compactness_se             -0.780794
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
sns.boxplot(x="radius_se", y="diagnosis", data=dataset, ax=axes[0])
sns.boxplot(x="texture_worst", y="diagnosis", data=dataset, ax=axes[1])
fig.tight_layout()
(figure: boxplots of radius_se and texture_worst grouped by diagnosis)

Decision Trees and Random Forest#

# A decision tree is built from the whole dataset,
# whereas a random forest randomly selects subsets of features to build many different decision trees

#--- Tree pruning --- letting a decision tree grow unrestricted leads to overfitting, so for real-world data we restrict the tree while tolerating a little impurity; this is called tree pruning (see the sketch below)

# A random forest trains multiple decision trees on samples of the dataset, with the samples drawn randomly with replacement (bootstrap sampling)
# Random forest models are an example of ensemble methods
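
To make the pruning remark concrete, a minimal sketch with hypothetical parameter values (sklearn supports both pre-pruning via size limits and post-pruning via cost-complexity):

from sklearn.tree import DecisionTreeClassifier

# pre-pruning: cap the depth and require a minimum number of samples per leaf
pruned_tree = DecisionTreeClassifier(max_depth=4, min_samples_leaf=5)

# post-pruning: cost-complexity pruning trades a little impurity for a smaller tree
cc_pruned_tree = DecisionTreeClassifier(ccp_alpha=0.01)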

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("/content/drive/MyDrive/Breast Cancer Data/Breast_cancer_data.csv")
print(dataset.head())
print("------------------------------------------------------------------------------")
print(dataset.columns)
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \
0  ...          17.33           184.60      2019.0            0.1622   
1  ...          23.41           158.80      1956.0            0.1238   
2  ...          25.53           152.50      1709.0            0.1444   
3  ...          26.50            98.87       567.7            0.2098   
4  ...          16.67           152.20      1575.0            0.1374   

   compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \
0             0.6656           0.7119                0.2654          0.4601   
1             0.1866           0.2416                0.1860          0.2750   
2             0.4245           0.4504                0.2430          0.3613   
3             0.8663           0.6869                0.2575          0.6638   
4             0.2050           0.4000                0.1625          0.2364   

   fractal_dimension_worst  Unnamed: 32  
0                  0.11890          NaN  
1                  0.08902          NaN  
2                  0.08758          NaN  
3                  0.17300          NaN  
4                  0.07678          NaN  

[5 rows x 33 columns]
------------------------------------------------------------------------------
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
# id and Unnamed: 32 are dropped again for this re-loaded copy; errors="ignore" keeps the cell re-runnable
dataset = dataset.drop(["id", "Unnamed: 32"], axis=1, errors="ignore")
dataset.columns
Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')
#--- Train test split

X = dataset.iloc[:, 1:]
y = dataset.iloc[:, 0]
print(X)
print(":=========================:")
print(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
print(X_train, X_test)

# Decision tree call and train on data

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
DecisionTreeClassifier()
# prediction and evaluation

preds = classifier.predict(X_test)

# Evaluation

from sklearn.metrics import confusion_matrix, classification_report

print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           B       0.94      0.88      0.91       105
           M       0.82      0.91      0.86        66

    accuracy                           0.89       171
   macro avg       0.88      0.89      0.88       171
weighted avg       0.89      0.89      0.89       171
# Random forest, from sklearn's ensemble module

from sklearn.ensemble import RandomForestClassifier

rfc_classifier = RandomForestClassifier(n_estimators= 300)

rfc_classifier.fit(X_train, y_train)

rfc_pred = rfc_classifier.predict(X_test)
print(classification_report(y_test, rfc_pred))
              precision    recall  f1-score   support

           B       0.99      0.96      0.98       105
           M       0.94      0.98      0.96        66

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171
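A random forest also reports how much each feature contributed to its splits; a short hedged addition using the classifier trained above:

importances = pd.Series(rfc_classifier.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head())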
# Cross validation and prediction

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100)
cross_validator = cross_validate(classifier, X, y, cv=5, scoring='accuracy', return_estimator=True)
print(cross_validator['test_score'])
[0.93859649 0.94736842 0.99122807 0.97368421 0.98230088]
# The fold with the highest test score above yields the best model;
# here that is the third fold, with accuracy 0.99122807
# (note: these estimators saw most of X_test during cross-validation, so the report below is optimistic)

best_model = cross_validator["estimator"][np.argmax(cross_validator["test_score"])]
best_model

rfc_best_pred = best_model.predict(X_test)
print(classification_report(y_test, rfc_best_pred))
              precision    recall  f1-score   support

           B       1.00      0.99      1.00       105
           M       0.99      1.00      0.99        66

    accuracy                           0.99       171
   macro avg       0.99      1.00      0.99       171
weighted avg       0.99      0.99      0.99       171

Support Vector Machines (SVMs)#

# Support vector machines are linear classifiers: they make linear decision boundaries
# The goal of an SVM is to find the hyperplane (decision boundary) that best separates the n-dimensional data into classes
# If the data is not linearly separable (as real-world data often isn't), we use the kernel trick to add another feature (dimension), thereby separating the points
# The radial basis function (RBF) kernel is one example (see the sketch below)
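
For concreteness, the RBF kernel scores the similarity of two points as exp(-gamma * ||x - y||^2); a tiny sketch of that formula (rbf_kernel here is an illustrative helper, not the sklearn function):

import numpy as np

def rbf_kernel(x, y, gamma=1.0):
    # 1.0 for identical points, decaying toward 0 as the distance grows
    return np.exp(-gamma * np.sum((x - y) ** 2))

print(rbf_kernel(np.array([0.0, 0.0]), np.array([1.0, 1.0])))  # exp(-2) ~ 0.135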
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#--- Wheat seed Dataset

dataset = pd.read_csv("/content/drive/MyDrive/Wheat_seed_dataset/wheat_seeds.csv")
print(dataset.info())
dataset.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             199 non-null    float64
 1   Perimeter        199 non-null    float64
 2   Compactness      199 non-null    float64
 3   Kernel.Length    199 non-null    float64
 4   Kernel.Width     199 non-null    float64
 5   Asymmetry.Coeff  199 non-null    float64
 6   Kernel.Groove    199 non-null    float64
 7   Type             199 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 12.6 KB
None
    Area  Perimeter  Compactness  Kernel.Length  Kernel.Width  Asymmetry.Coeff  Kernel.Groove  Type
0  15.26      14.84       0.8710          5.763         3.312            2.221          5.220     1
1  14.88      14.57       0.8811          5.554         3.333            1.018          4.956     1
2  14.29      14.09       0.9050          5.291         3.337            2.699          4.825     1
3  13.84      13.94       0.8955          5.324         3.379            2.259          4.805     1
4  16.14      14.99       0.9034          5.658         3.562            1.355          5.175     1
dataset["Type"].value_counts()
2    68
1    66
3    65
Name: Type, dtype: int64
dataset.isnull().sum()
Area               0
Perimeter          0
Compactness        0
Kernel.Length      0
Kernel.Width       0
Asymmetry.Coeff    0
Kernel.Groove      0
Type               0
dtype: int64
# Split the data: train/test split

X = dataset.iloc[:, 0:-1]
print(X)
y= dataset.iloc[:, -1]
print(y)

from sklearn.model_selection import train_test_split
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

print(X_train)
print(X_test)
      Area  Perimeter  Compactness  Kernel.Length  Kernel.Width  \
0    15.26      14.84       0.8710          5.763         3.312   
1    14.88      14.57       0.8811          5.554         3.333   
2    14.29      14.09       0.9050          5.291         3.337   
3    13.84      13.94       0.8955          5.324         3.379   
4    16.14      14.99       0.9034          5.658         3.562   
..     ...        ...          ...            ...           ...   
194  12.19      13.20       0.8783          5.137         2.981   
195  11.23      12.88       0.8511          5.140         2.795   
196  13.20      13.66       0.8883          5.236         3.232   
197  11.84      13.21       0.8521          5.175         2.836   
198  12.30      13.34       0.8684          5.243         2.974   

     Asymmetry.Coeff  Kernel.Groove  
0              2.221          5.220  
1              1.018          4.956  
2              2.699          4.825  
3              2.259          4.805  
4              1.355          5.175  
..               ...            ...  
194            3.631          4.870  
195            4.325          5.003  
196            8.315          5.056  
197            3.598          5.044  
198            5.637          5.063  

[199 rows x 7 columns]
0      1
1      1
2      1
3      1
4      1
      ..
194    3
195    3
196    3
197    3
198    3
Name: Type, Length: 199, dtype: int64
      Area  Perimeter  Compactness  Kernel.Length  Kernel.Width  \
24   16.19      15.16       0.8849          5.833         3.421   
43   13.80      14.04       0.8794          5.376         3.155   
101  18.83      16.29       0.8917          6.037         3.786   
142  12.70      13.71       0.8491          5.386         2.911   
21   15.88      14.90       0.8988          5.618         3.507   
..     ...        ...          ...            ...           ...   
113  18.89      16.23       0.9008          6.227         3.769   
64   14.01      14.29       0.8625          5.609         3.158   
15   13.99      13.83       0.9183          5.119         3.383   
125  18.30      15.89       0.9108          5.979         3.755   
9    15.26      14.85       0.8696          5.714         3.242   

     Asymmetry.Coeff  Kernel.Groove  
24            0.9030          5.307  
43            1.5600          4.961  
101           2.5530          5.879  
142           3.2600          5.316  
21            0.7651          5.091  
..               ...            ...  
113           3.6390          5.966  
64            2.2170          5.132  
15            5.2340          4.781  
125           2.8370          5.962  
9             4.5430          5.314  

[149 rows x 7 columns]
      Area  Perimeter  Compactness  Kernel.Length  Kernel.Width  \
59   12.36      13.19       0.8923          5.076         3.042   
5    14.38      14.21       0.8951          5.386         3.312   
20   14.11      14.26       0.8722          5.520         3.168   
124  17.55      15.66       0.8991          5.791         3.690   
52   14.52      14.60       0.8557          5.741         3.113   
19   14.16      14.40       0.8584          5.658         3.129   
161  12.15      13.45       0.8443          5.417         2.837   
55   14.92      14.43       0.9006          5.384         3.412   
69   19.11      16.26       0.9081          6.154         3.930   
2    14.29      14.09       0.9050          5.291         3.337   
98   19.46      16.50       0.8985          6.113         3.892   
10   14.03      14.16       0.8796          5.438         3.201   
75   17.12      15.55       0.8892          5.850         3.566   
134  13.07      13.92       0.8480          5.472         2.994   
193  12.37      13.47       0.8567          5.204         2.960   
63   14.34      14.37       0.8726          5.630         3.190   
110  19.06      16.45       0.8854          6.416         3.719   
78   20.20      16.89       0.8894          6.285         3.864   
178  10.91      12.80       0.8372          5.088         2.675   
114  20.03      16.90       0.8811          6.493         3.857   
149  11.19      13.05       0.8253          5.250         2.675   
129  15.56      14.89       0.8823          5.776         3.408   
61   12.78      13.57       0.8716          5.262         3.026   
87   18.76      16.20       0.8984          6.172         3.796   
102  17.63      15.86       0.8800          6.033         3.573   
120  18.75      16.18       0.8999          6.111         3.869   
168  10.74      12.73       0.8329          5.145         2.642   
1    14.88      14.57       0.8811          5.554         3.333   
47   14.86      14.67       0.8676          5.678         3.258   
171  11.41      12.95       0.8560          5.090         2.775   
185  12.11      13.27       0.8639          5.236         2.975   
39   13.50      13.85       0.8852          5.351         3.158   
76   16.53      15.34       0.8823          5.875         3.467   
91   16.87      15.65       0.8648          6.139         3.463   
35   17.08      15.38       0.9079          5.832         3.683   
121  18.65      16.41       0.8698          6.285         3.594   
169  11.48      13.05       0.8473          5.180         2.758   
162  11.35      13.12       0.8291          5.176         2.668   
46   14.79      14.52       0.8819          5.545         3.291   
173  12.19      13.36       0.8579          5.240         2.909   
189  12.62      13.67       0.8481          5.410         2.911   
7    16.63      15.46       0.8747          6.053         3.465   
26   12.74      13.67       0.8564          5.395         2.956   
148  11.36      13.05       0.8382          5.175         2.755   
58   11.23      12.63       0.8840          4.902         2.879   
72   17.32      15.91       0.8599          6.064         3.403   
103  19.94      16.92       0.8752          6.675         3.763   
198  12.30      13.34       0.8684          5.243         2.974   
56   15.38      14.77       0.8857          5.662         3.419   
184  10.82      12.83       0.8256          5.180         2.630   

     Asymmetry.Coeff  Kernel.Groove  
59             3.220          4.605  
5              2.462          4.956  
20             2.688          5.219  
124            5.366          5.661  
52             1.481          5.487  
19             3.072          5.176  
161            3.638          5.338  
55             1.142          5.088  
69             2.936          6.079  
2              2.699          4.825  
98             4.308          6.009  
10             1.717          5.001  
75             2.858          5.746  
134            5.304          5.395  
193            3.919          5.001  
63             1.313          5.150  
110            2.248          6.163  
78             5.173          6.187  
178            4.179          4.956  
114            3.063          6.320  
149            5.813          5.219  
129            4.972          5.847  
61             1.176          4.782  
87             3.120          6.053  
102            3.747          5.929  
120            4.188          5.992  
168            4.702          4.963  
1              1.018          4.956  
47             2.129          5.351  
171            4.957          4.825  
185            4.132          5.012  
39             2.249          5.176  
76             5.532          5.880  
91             3.696          5.967  
35             2.956          5.484  
121            4.391          6.102  
169            5.876          5.002  
162            4.337          5.132  
46             2.704          5.111  
173            4.857          5.158  
189            3.306          5.231  
7              2.040          5.877  
26             2.504          4.869  
148            4.048          5.263  
58             2.269          4.703  
72             3.824          5.922  
103            3.252          6.550  
198            5.637          5.063  
56             1.999          5.222  
184            4.853          5.089  
# Training an SVM classifier and tuning hyperparameters using GridSearchCV

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

#--- define the parameter grid using a dictionary

# the original grid listed 100 twice for C; presumably 1000 was intended
param = {"C": [1, 10, 100, 1000], "gamma": [1, 0.1, 0.01, 0.001, 10], "kernel": ["rbf"]}

grid = GridSearchCV(SVC(), param_grid= param, refit= True, verbose= 3)

grid.fit(X_train, y_train)
# best grid search param

best_param = grid.best_params_
print(best_param)
{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
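Two related attributes worth inspecting after the search (a hedged addition): the mean cross-validated accuracy of the winning parameters and the refitted estimator that grid.predict uses below.

print(grid.best_score_)       # mean CV accuracy of the best parameter set
print(grid.best_estimator_)   # refitted SVC used by grid.predict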
grid_prediction = grid.predict(X_test)
grid_prediction


# report
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, grid_prediction))
print("===============================")
print(classification_report(y_test, grid_prediction))
[[18  1  0]
 [ 0 16  0]
 [ 0  0 15]]
===============================
              precision    recall  f1-score   support

           1       1.00      0.95      0.97        19
           2       0.94      1.00      0.97        16
           3       1.00      1.00      1.00        15

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50

Neural Networks and Deep Learning#