## Loading required package: class
## Loading required package: gmodels
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: rpart
## Loading required package: rpart.plot
## Loading required package: data.table
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## Loading required package: plotROC
## 
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
## 
##     ggroc
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
##  sex          length         diameter          height      
##  F:1307   Min.   :0.075   Min.   :0.0550   Min.   :0.0000  
##  I:1342   1st Qu.:0.450   1st Qu.:0.3500   1st Qu.:0.1150  
##  M:1528   Median :0.545   Median :0.4250   Median :0.1400  
##           Mean   :0.524   Mean   :0.4079   Mean   :0.1395  
##           3rd Qu.:0.615   3rd Qu.:0.4800   3rd Qu.:0.1650  
##           Max.   :0.815   Max.   :0.6500   Max.   :1.1300  
##   whole_weight    shucked_wieght   viscera_wieght    shell_weight   
##  Min.   :0.0020   Min.   :0.0010   Min.   :0.0005   Min.   :0.0015  
##  1st Qu.:0.4415   1st Qu.:0.1860   1st Qu.:0.0935   1st Qu.:0.1300  
##  Median :0.7995   Median :0.3360   Median :0.1710   Median :0.2340  
##  Mean   :0.8287   Mean   :0.3594   Mean   :0.1806   Mean   :0.2388  
##  3rd Qu.:1.1530   3rd Qu.:0.5020   3rd Qu.:0.2530   3rd Qu.:0.3290  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      rings       
##  Min.   : 1.000  
##  1st Qu.: 8.000  
##  Median : 9.000  
##  Mean   : 9.934  
##  3rd Qu.:11.000  
##  Max.   :29.000
## 'data.frame':    4177 obs. of  9 variables:
##  $ sex           : Factor w/ 3 levels "F","I","M": 3 3 1 3 2 2 1 1 3 1 ...
##  $ length        : num  0.455 0.35 0.53 0.44 0.33 0.425 0.53 0.545 0.475 0.55 ...
##  $ diameter      : num  0.365 0.265 0.42 0.365 0.255 0.3 0.415 0.425 0.37 0.44 ...
##  $ height        : num  0.095 0.09 0.135 0.125 0.08 0.095 0.15 0.125 0.125 0.15 ...
##  $ whole_weight  : num  0.514 0.226 0.677 0.516 0.205 ...
##  $ shucked_wieght: num  0.2245 0.0995 0.2565 0.2155 0.0895 ...
##  $ viscera_wieght: num  0.101 0.0485 0.1415 0.114 0.0395 ...
##  $ shell_weight  : num  0.15 0.07 0.21 0.155 0.055 0.12 0.33 0.26 0.165 0.32 ...
##  $ rings         : int  15 7 9 10 7 8 20 16 9 19 ...

## young adult   old 
##  1407  1810   960

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1244  0.2253  0.2410  0.3369  1.0000

It is said that a good K can be the square root of the number of observations (here √3332 ≈ 58, hence k = 54). I then checked whether accuracy would be higher with a smaller K. Accuracy is still circa 68% either way, so this K might not be ideal for the data: misclassification is about 32%. (Note that the response has three classes — young, adult, old — so an odd K is preferable to reduce, though not eliminate, voting ties.)

# Randomly split the preprocessed abalone data `z` into ~80% train / ~20% test
# by drawing a 1-or-2 label for every row. NOTE(review): no set.seed() is
# visible before this call, so the split is not reproducible as written.
ind <- sample(2, nrow(z), replace=TRUE, prob=c(0.8, 0.2))
KNNtrain <- z[ind==1,]
KNNtest <- z[ind==2,]
# k-NN on columns 1:7 (assumed to be the 7 numeric predictors — consistent
# with caret's later "7 predictor" report), classifying the binned rings
# factor (young/adult/old). prob=TRUE attaches the winning vote proportion.
# NOTE(review): k = 54 is even, so 3-class voting ties are possible and
# class::knn breaks them at random; an odd k (53/55) would avoid this.
KNNpred <- knn(train = KNNtrain[1:7], test = KNNtest[1:7], cl = KNNtrain$rings, k = 54, prob=TRUE)


# Confusion matrix of actual vs predicted age class (chi-square cells suppressed).
CrossTable(x = KNNtest$rings, y = KNNpred, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  845 
## 
##  
##               | KNNpred 
## KNNtest$rings |     young |     adult |       old | Row Total | 
## --------------|-----------|-----------|-----------|-----------|
##         young |       209 |        65 |         6 |       280 | 
##               |     0.746 |     0.232 |     0.021 |     0.331 | 
##               |     0.733 |     0.143 |     0.057 |           | 
##               |     0.247 |     0.077 |     0.007 |           | 
## --------------|-----------|-----------|-----------|-----------|
##         adult |        64 |       291 |        22 |       377 | 
##               |     0.170 |     0.772 |     0.058 |     0.446 | 
##               |     0.225 |     0.640 |     0.210 |           | 
##               |     0.076 |     0.344 |     0.026 |           | 
## --------------|-----------|-----------|-----------|-----------|
##           old |        12 |        99 |        77 |       188 | 
##               |     0.064 |     0.527 |     0.410 |     0.222 | 
##               |     0.042 |     0.218 |     0.733 |           | 
##               |     0.014 |     0.117 |     0.091 |           | 
## --------------|-----------|-----------|-----------|-----------|
##  Column Total |       285 |       455 |       105 |       845 | 
##               |     0.337 |     0.538 |     0.124 |           | 
## --------------|-----------|-----------|-----------|-----------|
## 
## 
# Accuracy = proportion of correctly classified test observations.
# Computed from the prediction vector directly instead of hand-copied
# confusion-matrix diagonal counts (209 + 291 + 77), so the figure stays
# correct if the random split changes. Equals (209+291+77)/845 here.
mean(KNNpred == KNNtest$rings)
## [1] 0.6828402
# Second k-NN fit with a smaller, odd k (23) to check whether accuracy
# improves; prob=TRUE is omitted because vote proportions are not used here.
KNNpred2 <- knn(train = KNNtrain[1:7], test = KNNtest[1:7], cl = KNNtrain$rings, k = 23)


# Confusion matrix for the k = 23 model.
CrossTable(x = KNNtest$rings, y = KNNpred2, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  845 
## 
##  
##               | KNNpred2 
## KNNtest$rings |     young |     adult |       old | Row Total | 
## --------------|-----------|-----------|-----------|-----------|
##         young |       217 |        57 |         6 |       280 | 
##               |     0.775 |     0.204 |     0.021 |     0.331 | 
##               |     0.753 |     0.129 |     0.052 |           | 
##               |     0.257 |     0.067 |     0.007 |           | 
## --------------|-----------|-----------|-----------|-----------|
##         adult |        62 |       286 |        29 |       377 | 
##               |     0.164 |     0.759 |     0.077 |     0.446 | 
##               |     0.215 |     0.647 |     0.252 |           | 
##               |     0.073 |     0.338 |     0.034 |           | 
## --------------|-----------|-----------|-----------|-----------|
##           old |         9 |        99 |        80 |       188 | 
##               |     0.048 |     0.527 |     0.426 |     0.222 | 
##               |     0.031 |     0.224 |     0.696 |           | 
##               |     0.011 |     0.117 |     0.095 |           | 
## --------------|-----------|-----------|-----------|-----------|
##  Column Total |       288 |       442 |       115 |       845 | 
##               |     0.341 |     0.523 |     0.136 |           | 
## --------------|-----------|-----------|-----------|-----------|
## 
## 
# Accuracy for the k = 23 model, computed from the predictions directly
# rather than hand-copied diagonal counts (217 + 286 + 80); equals
# (217+286+80)/845 for the split shown above.
mean(KNNpred2 == KNNtest$rings) # accuracy
## [1] 0.6899408

There wasn’t much of a difference in accuracy between the predictions of the KNN and random forest models.

# Random forest on the same train/test split used for k-NN.
set.seed(55)   # fix RNG so bootstrap samples / variable draws are reproducible
RFtrain <- KNNtrain
RFtest <- KNNtest

# Classification forest: rings (young/adult/old) on all remaining columns.
# randomForest defaults apply: 500 trees, mtry = floor(sqrt(7)) = 2
# candidate variables per split (both confirmed by the printout below).
modelRF <- randomForest(rings ~., data = RFtrain)
modelRF   # prints the OOB error estimate and OOB confusion matrix
## 
## Call:
##  randomForest(formula = rings ~ ., data = RFtrain) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 33.1%
## Confusion matrix:
##       young adult old class.error
## young   834   280  13   0.2599823
## adult   230  1007 196   0.2972784
## old      35   349 388   0.4974093
# Predicted class for each held-out observation (majority vote over trees).
prediction <- predict(modelRF, newdata = RFtest)

# Confusion matrix of actual vs predicted age class for the random forest.
CrossTable(x = RFtest$rings, y = prediction, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  845 
## 
##  
##              | prediction 
## RFtest$rings |     young |     adult |       old | Row Total | 
## -------------|-----------|-----------|-----------|-----------|
##        young |       212 |        63 |         5 |       280 | 
##              |     0.757 |     0.225 |     0.018 |     0.331 | 
##              |     0.752 |     0.156 |     0.032 |           | 
##              |     0.251 |     0.075 |     0.006 |           | 
## -------------|-----------|-----------|-----------|-----------|
##        adult |        65 |       261 |        51 |       377 | 
##              |     0.172 |     0.692 |     0.135 |     0.446 | 
##              |     0.230 |     0.644 |     0.323 |           | 
##              |     0.077 |     0.309 |     0.060 |           | 
## -------------|-----------|-----------|-----------|-----------|
##          old |         5 |        81 |       102 |       188 | 
##              |     0.027 |     0.431 |     0.543 |     0.222 | 
##              |     0.018 |     0.200 |     0.646 |           | 
##              |     0.006 |     0.096 |     0.121 |           | 
## -------------|-----------|-----------|-----------|-----------|
## Column Total |       282 |       405 |       158 |       845 | 
##              |     0.334 |     0.479 |     0.187 |           | 
## -------------|-----------|-----------|-----------|-----------|
## 
## 
# Accuracy for the random forest, computed from the predictions directly
# rather than hand-copied diagonal counts (212 + 261 + 102); equals
# (212+261+102)/845 for the split shown above.
mean(prediction == RFtest$rings) # accuracy
## [1] 0.6804734

With 10-fold cross-validation (repeated 3 times), KNN performed optimally with k = 9 and random forest with mtry = 2.

# Tune k for k-NN with 10-fold cross-validation repeated 3 times.
# Predictors are scaled first — k-NN is distance-based, so scale matters.
set.seed(55)   # reproducible fold assignment
controlknn = trainControl(method="repeatedcv", number=10, repeats=3)
model2 <- train(rings~., data = KNNtrain, method = "knn", preProcess="scale", trControl=controlknn)
model2   # caret's default grid tries k = 5, 7, 9 and keeps the most accurate
## k-Nearest Neighbors 
## 
## 3332 samples
##    7 predictor
##    3 classes: 'young', 'adult', 'old' 
## 
## Pre-processing: scaled (7) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2999, 3000, 2999, 2998, 2998, 2999, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.6414504  0.4372751
##   7  0.6499590  0.4483162
##   9  0.6595692  0.4621399
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Same repeated 10-fold CV for the random forest, tuning mtry.
# NOTE(review): no set.seed() immediately before this call, so the fold
# assignment differs from the k-NN run (visible in the differing "Summary
# of sample sizes" lines) — reseed here for a like-for-like comparison.
controlRF = trainControl(method="repeatedcv", number=10, repeats=3)
model3 <- train(rings~., data = KNNtrain, method = "rf", preProcess="scale", trControl=controlRF)
model3   # caret tries mtry = 2, 4, 7 and keeps the most accurate
## Random Forest 
## 
## 3332 samples
##    7 predictor
##    3 classes: 'young', 'adult', 'old' 
## 
## Pre-processing: scaled (7) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2998, 2999, 3000, 2999, 2999, 2998, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.6734786  0.4864738
##   4     0.6730833  0.4864804
##   7     0.6633702  0.4718188
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.