# Packages used throughout this analysis (startup and masking messages
# omitted); caret also pulls in lattice and ggplot2 as dependencies.
library(e1071)
library(caret)
library(kernlab)
library(nnet)
library(neuralnet)
library(class)
library(gmodels)
library(boot)
library(NeuralNetTools)
library(data.table)

I named the columns and then converted them from factors to numeric, as shown by str() run before and after the transformation. I dropped veil_type (V17) since it has only one level and may create errors, and stalk_root since it has too many missing values.

colnames(mushroom) <- c("edibility", "cap_shape", "cap_surface", "cap_color",
                        "bruises", "odor", "grill_attachment", "grill_spacing",
                        "grill_size", "grill_color", "stalk_shape", "stalk_root",
                        "stalk_surface_above_ring", "stalk_surface_below_ring",
                        "stalk_color_above_ring", "stalk_color_below_ring",
                        "veil_type", "veil_color", "ring_number", "ring_type",
                        "spore_print_color", "population", "habitat")
sum(is.na(mushroom))
## [1] 0
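is.na() reports zero here because the UCI mushroom data encodes missing entries as the character "?" rather than NA, and stalk_root is the only affected column; a quick check (output not shown) would count them directly:

# Count "?" entries in stalk_root (the UCI documentation lists 2480 of them)
sum(mushroom$stalk_root == "?")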
mushroom <- subset(mushroom, select = -veil_type)
mushroom <- subset(mushroom, select = -stalk_root)
mushroom$cap_shape <- as.numeric(mushroom$cap_shape)
mushroom$cap_surface <- as.numeric(mushroom$cap_surface)
mushroom$cap_color <- as.numeric(mushroom$cap_color)
mushroom$bruises <- as.numeric(mushroom$bruises)
mushroom$odor <- as.numeric(mushroom$odor)
mushroom$grill_attachment <- as.numeric(mushroom$grill_attachment)
mushroom$grill_spacing <- as.numeric(mushroom$grill_spacing)
mushroom$grill_size <- as.numeric(mushroom$grill_size)
mushroom$grill_color <- as.numeric(mushroom$grill_color)
mushroom$stalk_shape <- as.numeric(mushroom$stalk_shape)

mushroom$stalk_surface_above_ring <- as.numeric(mushroom$stalk_surface_above_ring)
mushroom$stalk_surface_below_ring <- as.numeric(mushroom$stalk_surface_below_ring)
mushroom$stalk_color_above_ring <- as.numeric(mushroom$stalk_color_above_ring)
mushroom$stalk_color_below_ring <- as.numeric(mushroom$stalk_color_below_ring)
mushroom$veil_color <- as.numeric(mushroom$veil_color)
mushroom$ring_number <- as.numeric(mushroom$ring_number)
mushroom$ring_type <- as.numeric(mushroom$ring_type)
mushroom$spore_print_color <- as.numeric(mushroom$spore_print_color)
mushroom$population <- as.numeric(mushroom$population)
mushroom$habitat <- as.numeric(mushroom$habitat)
shrooms <- as.numeric(mushroom$edibility)
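
The twenty assignments above can also be collapsed into a single call; a compact equivalent, assuming every column except edibility is still a factor at this point:

# Coerce all predictor columns (everything but edibility in column 1)
# from factors to their underlying integer codes in one pass.
mushroom[-1] <- lapply(mushroom[-1], as.numeric)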

After normalizing the data frame I created dummy variables and used createDataPartition() with p = 0.8 to make a stratified 80/20 split into training and test sets. Since every predictor is already numeric at this point, dummyVars() passes all 20 columns through unchanged; it would only expand factor columns into indicators.

normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
normalize(c(1,2,3,4,5))
## [1] 0.00 0.25 0.50 0.75 1.00
normalize(c(10, 20, 30, 40, 50))
## [1] 0.00 0.25 0.50 0.75 1.00
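
One caveat: normalize() divides by max(x) - min(x), so a constant column produces NaN, another reason the single-level veil_type had to be dropped beforehand:

normalize(c(3, 3, 3))  # 0/0 on a constant column
## [1] NaN NaN NaN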
mushroom[2:21] <- as.data.frame(lapply(mushroom[2:21], normalize))

summary(mushroom$edibility)
##    e    p 
## 4208 3916
dum <- createDataPartition(mushroom$edibility, p = .8, list = FALSE)
dummy <- subset(mushroom, select = -edibility)
mushDummy <- dummyVars(~., data = dummy, sep = ".")
mushDummy <- data.frame(predict(mushDummy, dummy))
ncol(mushDummy)
## [1] 20
mushDummy$edibility <- mushroom$edibility
ncol(mushDummy)
## [1] 21
mushtrain <- mushDummy[dum,]
mushtest <- mushDummy[-dum,]
mushtestLabels <- subset(mushtest, select = edibility)
mushtestset <- subset(mushtest, select = -edibility)
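
Since createDataPartition() stratifies on the outcome, the roughly 52/48 edible-to-poisonous balance should carry over to both sets; a quick sanity check (output not shown):

# Class proportions in train and test should match the full data
# (4208 e / 3916 p); the split leaves 6500 training and 1624 test rows.
prop.table(table(mushtrain$edibility))
prop.table(table(mushtest$edibility))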

I first attempted building a neural network using the nnet package. With 2 units in the hidden layer, the confusion matrix shows an accuracy of about 98% with P-Value [Acc > NIR] < 2.2e-16; 2.2e-16 is R's machine epsilon, the smallest double x for which 1 + x differs from 1, and R reports anything below it as "< 2.2e-16", so that is about as significant as a reported p-value gets. McNemar's test p-value is 0.27, which here is good news rather than bad: the test checks whether the two error types occur at different rates, so a non-significant result means the misclassifications are roughly balanced between the classes.
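
To make that concrete: McNemar's test looks only at the two discordant cells of the confusion matrix reported below (24 edible mushrooms called poisonous versus 16 poisonous called edible), and caret's 0.2684 can be reproduced directly from them:

# McNemar's test on the discordant counts b = 24 and c = 16:
# chi-squared = (|24 - 16| - 1)^2 / (24 + 16) = 1.225, p = 0.268
mcnemar.test(matrix(c(817, 16, 24, 767), nrow = 2))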

nnet supports only a single hidden layer, so there is no way to add another one, and it exposes no activation-function option: the hidden units are fixed to the logistic sigmoid (and with entropy fitting the output unit is logistic as well). The model is therefore still nonlinear, just limited to one layer; for deeper architectures I turn to the neuralnet package below.

I increased the number of units in the hidden layer to 5 and accuracy dropped to about 96%, with McNemar's test p-value < 2.2e-16: all 73 errors are now poisonous mushrooms predicted edible, a strongly asymmetric (and, for this problem, dangerous) error pattern.

rang = range of the initial random weights, drawn on [-rang, rang]. The nnet documentation suggests a value of about 0.5 unless the inputs are large, in which case rang should be chosen so that rang * max(|x|) is about 1. I kept this at 0.1 for the preceding models but decided to bump it to 0.5 for the last model.

maxit = maximum number of iterations, with a default of 100. nnet fits by BFGS optimization, so each iteration evaluates the full training set rather than a mini-batch. The preceding models used 200 iterations, so I increased it to 300.

Keeping the hidden layer at 5 units, with rang (initial random weights) at 0.5 and maxit (iterations) at 300, I get a model with 99% accuracy, P-Value [Acc > NIR] < 2.2e-16, and McNemar's test p-value of about 0.0005. This seems like a good stopping point, for fear of overfitting.

set.seed(30)
net <- nnet(edibility ~ ., data = mushtrain, size = 2, rang = 0.1, maxit = 200)
## # weights:  45
## initial  value 4509.713564 
## iter  10 value 1616.914240
## iter  20 value 885.624455
## iter  30 value 665.810578
## iter  40 value 646.004397
## iter  50 value 643.086721
## iter  60 value 642.631000
## iter  70 value 642.403752
## iter  80 value 642.310837
## iter  90 value 642.253186
## final  value 642.251126 
## converged
summary(net)
## a 20-2-1 network with 45 weights
## options were - entropy fitting 
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1 
##  178.23   -4.38    3.54   -0.38  227.94  542.47 -192.13   30.80  129.50 
##  i9->h1 i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 
##   -0.95  -78.18   -4.40   -4.37   20.28   33.66  -51.82 -411.66 -155.78 
## i18->h1 i19->h1 i20->h1 
## -155.35   38.03   25.01 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2 
##   47.66   -0.43   -3.69    0.02  140.08   59.01  -10.62  191.25 -151.84 
##  i9->h2 i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 
##  -16.70  -98.00   -0.34   -0.32   -8.09  -31.96 -178.88  126.40  -12.62 
## i18->h2 i19->h2 i20->h2 
##   43.06    4.21  -25.65 
##    b->o   h1->o   h2->o 
##    6.63   -4.33   -8.93
mush.predict <- predict(net, mushtestset, type = "class")

net.table <- table(mushtest$edibility, mush.predict)
net.table
##    mush.predict
##       e   p
##   e 817  24
##   p  16 767
caret::confusionMatrix(net.table)
## Confusion Matrix and Statistics
## 
##    mush.predict
##       e   p
##   e 817  24
##   p  16 767
##                                           
##                Accuracy : 0.9754          
##                  95% CI : (0.9666, 0.9823)
##     No Information Rate : 0.5129          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9507          
##                                           
##  Mcnemar's Test P-Value : 0.2684          
##                                           
##             Sensitivity : 0.9808          
##             Specificity : 0.9697          
##          Pos Pred Value : 0.9715          
##          Neg Pred Value : 0.9796          
##              Prevalence : 0.5129          
##          Detection Rate : 0.5031          
##    Detection Prevalence : 0.5179          
##       Balanced Accuracy : 0.9752          
##                                           
##        'Positive' Class : e               
## 
net2 <- nnet(edibility ~ ., data = mushtrain, size = 5, rang = 0.1, maxit = 200)
## # weights:  111
## initial  value 4507.787188 
## iter  10 value 1947.770702
## iter  20 value 1350.848665
## iter  30 value 908.981608
## iter  40 value 619.100238
## iter  50 value 602.281717
## iter  60 value 599.229080
## iter  70 value 599.178603
## iter  80 value 591.113614
## iter  90 value 546.426766
## iter 100 value 545.646369
## iter 110 value 545.629258
## iter 120 value 545.609648
## iter 130 value 545.605617
## iter 140 value 545.601707
## iter 150 value 545.601060
## iter 160 value 545.597053
## final  value 545.597009 
## converged
summary(net2)
## a 20-5-1 network with 111 weights
## options were - entropy fitting 
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1 
##    8.29   -0.44   -0.26    0.25   26.07  -30.64   27.57   -4.11  -12.65 
##  i9->h1 i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 
##   -3.29  -37.28   -0.58   -0.58    2.36    2.59   18.91  -10.88   14.54 
## i18->h1 i19->h1 i20->h1 
##  -46.12  -21.08   -2.11 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2 
##   -4.53    0.15   -2.93  -18.11   19.99    6.88    4.88   15.45   -1.55 
##  i9->h2 i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 
##   -0.04   -7.98    0.29    0.11   12.89    6.57  -11.94   10.16   17.86 
## i18->h2 i19->h2 i20->h2 
##  -24.65    9.83    1.49 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3 
##    7.53    1.25    1.65    4.27    9.58    2.58    6.14    4.79   -9.70 
##  i9->h3 i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 
##    7.11    2.18    1.12    1.07    4.68    4.92    4.59    4.84    9.13 
## i18->h3 i19->h3 i20->h3 
##    0.74    0.74   -0.08 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4 
##    5.44    4.58    5.14    0.98    2.56    5.15    5.61    0.25    4.71 
##  i9->h4 i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 
##    0.66    9.69    4.62    4.62    5.42    5.44    3.56    3.19    0.88 
## i18->h4 i19->h4 i20->h4 
##    6.68    4.77    0.80 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5 
##  -11.60    0.14   -1.22   -0.11   17.51   -7.34   -5.14    6.34  -26.59 
##  i9->h5 i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 
##    0.64   10.32    0.16    0.13    5.95    6.16  -15.75   23.43   12.55 
## i18->h5 i19->h5 i20->h5 
##   -1.16   -2.57    0.20 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##  -4.95  41.43 -37.92  26.60  -5.61 -20.23
mush.predict2 <- predict(net2, mushtestset, type = "class")

net.table2 <- table(mushtest$edibility, mush.predict2)
net.table2
##    mush.predict2
##       e   p
##   e 841   0
##   p  73 710
caret::confusionMatrix(net.table2)
## Confusion Matrix and Statistics
## 
##    mush.predict2
##       e   p
##   e 841   0
##   p  73 710
##                                           
##                Accuracy : 0.955           
##                  95% CI : (0.9438, 0.9646)
##     No Information Rate : 0.5628          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9097          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9201          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9068          
##              Prevalence : 0.5628          
##          Detection Rate : 0.5179          
##    Detection Prevalence : 0.5179          
##       Balanced Accuracy : 0.9601          
##                                           
##        'Positive' Class : e               
## 
net3 <- nnet(edibility ~ ., data = mushtrain, size = 5, rang = 0.5, maxit = 300)
## # weights:  111
## initial  value 4645.598183 
## iter  10 value 1731.881070
## iter  20 value 463.949433
## iter  30 value 302.321227
## iter  40 value 287.142980
## iter  50 value 286.920952
## iter  60 value 286.841977
## iter  70 value 285.619689
## iter  80 value 281.242699
## iter  90 value 276.455631
## iter 100 value 270.048112
## iter 110 value 266.763770
## iter 120 value 265.696896
## iter 130 value 202.577197
## iter 140 value 193.524861
## iter 150 value 193.295270
## iter 160 value 193.257614
## iter 170 value 193.241518
## iter 180 value 193.223312
## iter 190 value 193.210677
## iter 200 value 193.205754
## iter 210 value 193.204800
## final  value 193.204565 
## converged
summary(net3)
## a 20-5-1 network with 111 weights
## options were - entropy fitting 
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1 
##  -11.38    1.65   48.66   -4.89   17.91   11.83  -13.11   10.36   14.61 
##  i9->h1 i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 
##   -7.74  -51.37    1.68    1.71  -16.75  -22.70   -8.87   -7.65   57.04 
## i18->h1 i19->h1 i20->h1 
##  -31.88    5.36    5.89 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2 
##    4.16    0.60    0.12   -1.50  -17.76   -1.87    2.83   -3.94   23.43 
##  i9->h2 i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 
##    0.83  -20.26    0.02    0.40   -2.85   -2.68    8.37   -7.02   -8.26 
## i18->h2 i19->h2 i20->h2 
##    3.84    0.92    1.73 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3 
##  -11.37    1.14   -5.52   -1.64   11.78   17.66   -3.36   34.93  -10.64 
##  i9->h3 i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 
##    5.26   18.00    1.12    0.87   -4.27  -18.59   -8.47  -10.71   -6.78 
## i18->h3 i19->h3 i20->h3 
##   -0.27   19.79  -15.02 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4 
##    3.82   -4.88   -6.99    3.90   14.91   -2.78    2.97   -2.55   -0.09 
##  i9->h4 i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 
##   11.15   -6.99   -5.22   -5.00    5.65    4.78    1.92    3.44   17.07 
## i18->h4 i19->h4 i20->h4 
##   -0.45    6.02    5.42 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5 
##  -16.05    0.27    0.90    0.04  -41.12   16.07  -11.46    8.25    5.92 
##  i9->h5 i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 
##    1.58   -0.72    0.02    0.12    5.46    5.42  -23.98   33.00  -18.51 
## i18->h5 i19->h5 i20->h5 
##   36.77    3.97   17.34 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##  38.65 -28.86  27.91 -45.46 -12.08 -39.70
mush.predict3 <- predict(net3, mushtestset, type = "class")

net.table3 <- table(mushtest$edibility, mush.predict3)
net.table3
##    mush.predict3
##       e   p
##   e 841   0
##   p  14 769
caret::confusionMatrix(net.table3)
## Confusion Matrix and Statistics
## 
##    mush.predict3
##       e   p
##   e 841   0
##   p  14 769
##                                           
##                Accuracy : 0.9914          
##                  95% CI : (0.9856, 0.9953)
##     No Information Rate : 0.5265          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9827          
##                                           
##  Mcnemar's Test P-Value : 0.000512        
##                                           
##             Sensitivity : 0.9836          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9821          
##              Prevalence : 0.5265          
##          Detection Rate : 0.5179          
##    Detection Prevalence : 0.5179          
##       Balanced Accuracy : 0.9918          
##                                           
##        'Positive' Class : e               
## 

Now on to the neuralnet package. For the first fit I chose a single hidden layer with 4 neurons; the hidden argument takes a vector, so any number of layers can be specified. I kept act.fct at the default logistic; without a nonlinear activation function the network would reduce to a linear model and could not learn complicated kinds of data such as images, speech, or audio. To calculate the error, "ce" (cross-entropy) is used in place of the default "sse" since we are predicting a binary outcome. linear.output is set to FALSE so the activation function is also applied to the output neuron; the default, TRUE, would skip it.

For fit2 I built 2 layers with 5 hidden neurons in each layer.

For fit3 I built 3 layers with 5 hidden neurons in each layer.

n <- names(mushtrain)
form <- as.formula(paste("edibility~", paste(n[!n %in% "edibility"], collapse = "+")))
form
## edibility ~ cap_shape + cap_surface + cap_color + bruises + odor + 
##     grill_attachment + grill_spacing + grill_size + grill_color + 
##     stalk_shape + stalk_surface_above_ring + stalk_surface_below_ring + 
##     stalk_color_above_ring + stalk_color_below_ring + veil_color + 
##     ring_number + ring_type + spore_print_color + population + 
##     habitat
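As an aside, base R's reformulate() builds the same formula without the paste()/as.formula() dance:

# Equivalent formula construction
form <- reformulate(setdiff(n, "edibility"), response = "edibility")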
fit <- neuralnet(form, 
                data = mushtrain,
                err.fct = "ce",
                hidden=c(4),
                act.fct = "logistic",
                linear.output = FALSE
                )  
plot(fit, rep = "best")

fit2 <- neuralnet(form,
                  data = mushtrain,
                  err.fct = "ce",
                  hidden = c(5, 5),
                  act.fct = "logistic",
                  linear.output = FALSE
                  )
plot(fit2, rep = "best")
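
The fits above are only plotted here, so as a sketch of how they could be scored on the held-out set (assuming a neuralnet version >= 1.44, where a factor response yields one output column per class level and a predict() method is available):

# Score fit2 on the test predictors: predict() returns one probability
# column per class level (e, p); take the larger one per row and
# cross-tabulate against the true labels.
prob <- predict(fit2, mushtestset)
pred.class <- levels(mushtest$edibility)[max.col(prob)]
table(mushtest$edibility, pred.class)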