## Loading required package: e1071
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha

I named the columns and then converted them from factors to numeric, as shown by str() run before and after the transformation. I also dropped veil_type (V17), since it has only one level, and a constant column can cause errors during model fitting.

str(mushroom)
## 'data.frame':    8124 obs. of  23 variables:
##  $ V1 : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ V2 : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ V3 : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ V4 : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ V5 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ V6 : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ V7 : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V8 : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ V9 : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ V10: Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ V11: Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ V12: Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ V13: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V14: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V15: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ V16: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ V17: Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V18: Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V19: Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V20: Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ V21: Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ V22: Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ V23: Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
colnames(mushroom) <- c("edibility", "cap_shape", "cap_surface", "cap_color",
                        "bruises", "odor", "gill_attachment", "gill_spacing",
                        "gill_size", "gill_color", "stalk_shape", "stalk_root",
                        "stalk_surface_above_ring", "stalk_surface_below_ring",
                        "stalk_color_above_ring", "stalk_color_below_ring",
                        "veil_type", "veil_color", "ring_number", "ring_type",
                        "spore_print_color", "population", "habitat")

sum(is.na(mushroom))
## [1] 0
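Note that is.na() comes back clean only because this dataset encodes missing stalk_root values as the literal factor level "?" rather than NA (visible as the first level of V12 in the str() output above). A quick check:

sum(mushroom$stalk_root == "?")   # counts the "?" placeholders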
mushroom <- subset(mushroom, select = -veil_type)
# convert every predictor (all columns except edibility) from factor
# to its numeric level code
mushroom[, -1] <- lapply(mushroom[, -1], as.numeric)
str(mushroom)
## 'data.frame':    8124 obs. of  22 variables:
##  $ edibility               : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : num  6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : num  3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : num  5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : num  2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : num  7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : num  2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : num  5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : num  1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : num  4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: num  3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: num  3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_color              : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : num  5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : num  3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : num  4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : num  6 2 4 6 2 2 4 4 2 4 ...
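A side note on this conversion: as.numeric() on a factor returns its internal level code, which imposes an artificial ordering on nominal attributes such as odor. A common alternative, sketched here against the pre-conversion factor data frame, is one-hot encoding with model.matrix():

# hedged sketch: expand each factor into 0/1 indicator columns instead of
# ordered level codes (run this on the data frame before as.numeric)
mush_onehot <- model.matrix(edibility ~ . - 1, data = mushroom)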

Tabulating edibility shows that 4,208 of the 8,124 mushrooms (52%) are edible and 3,916 (48%) are poisonous, so the classes are roughly balanced. I then split the data 80/20 into training and test sets.

4208/8124
## [1] 0.5179714
3916/8124
## [1] 0.4820286
.8 * 8124
## [1] 6499.2
.2 * 8124
## [1] 1624.8
set.seed(2019)  # fix the RNG so the split is reproducible; seed value is arbitrary
sam <- sample(8124, 6499)
mush_train <- mushroom[sam, ]
mush_test <- mushroom[-sam, ]
dim(mush_train)
## [1] 6499   22
dim(mush_test)
## [1] 1625   22
str(mush_train)
## 'data.frame':    6499 obs. of  22 variables:
##  $ edibility               : Factor w/ 2 levels "e","p": 1 1 1 1 2 1 2 2 2 2 ...
##  $ cap_shape               : num  3 6 3 6 6 6 6 6 4 4 ...
##  $ cap_surface             : num  4 1 3 4 4 4 4 4 1 4 ...
##  $ cap_color               : num  10 5 5 4 5 2 5 10 10 3 ...
##  $ bruises                 : num  2 1 1 2 1 2 1 1 1 1 ...
##  $ odor                    : num  4 6 6 6 9 6 3 3 6 3 ...
##  $ gill_attachment         : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ gill_spacing            : num  1 2 2 1 1 1 1 1 1 1 ...
##  $ gill_size               : num  1 1 1 1 2 1 2 1 2 2 ...
##  $ gill_color              : num  11 6 6 10 1 11 1 8 11 1 ...
##  $ stalk_shape             : num  1 2 2 2 2 1 2 1 1 2 ...
##  $ stalk_root              : num  5 4 4 2 1 2 1 2 1 1 ...
##  $ stalk_surface_above_ring: num  3 6 3 6 6 6 6 6 4 4 ...
##  $ stalk_surface_below_ring: num  3 6 3 6 6 6 6 6 4 4 ...
##  $ stalk_color_above_ring  : num  8 8 8 4 8 8 7 7 8 7 ...
##  $ stalk_color_below_ring  : num  8 8 8 4 7 8 8 1 5 7 ...
##  $ veil_color              : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : num  2 2 2 2 2 3 2 2 2 2 ...
##  $ ring_type               : num  5 1 1 5 1 5 1 3 1 1 ...
##  $ spore_print_color       : num  4 3 3 3 8 8 8 2 8 8 ...
##  $ population              : num  6 1 1 5 5 6 5 5 5 5 ...
##  $ habitat                 : num  5 2 2 1 3 5 3 5 1 3 ...
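One caveat on the split above: an unstratified random sample can drift slightly from the 52/48 class balance. Since caret is already loaded, createDataPartition() keeps the ratio intact; a sketch (the "2" names are illustrative):

# hedged sketch: a class-stratified 80/20 split with caret
idx <- createDataPartition(mushroom$edibility, p = 0.8, list = FALSE)
mush_train2 <- mushroom[idx, ]
mush_test2  <- mushroom[-idx, ]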

With the training data I first ran the linear kernel ("vanilladot"), which produced a model that is about 98% accurate. When the classes can be separated perfectly by a straight line (or, in higher dimensions, a flat hyperplane), the linear kernel is the best fit. It is also the fastest option, since it is not really a kernel at all: it simply computes the dot product of the raw feature vectors without mapping them into a higher-dimensional space. The linear kernel is commonly recommended when the number of features exceeds the number of observations, and also when the number of observations exceeds roughly 50,000, where the Gaussian kernel becomes slow.
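To make "not really a kernel" concrete: kernlab's vanilladot just computes the plain dot product. A minimal sketch with made-up vectors:

# minimal sketch: the linear kernel value equals the ordinary dot product
x <- c(1, 2, 3)
y <- c(4, 5, 6)
lin <- vanilladot()
lin(x, y)      # kernlab's linear kernel value: 32
sum(x * y)     # the same value computed by hand: 32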

mush_classifier <- ksvm(edibility ~ ., data = mush_train,
                        kernel = "vanilladot")
##  Setting default kernel parameters
print(mush_classifier)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 1696 
## 
## Objective Function Value : -944.8318 
## Training error : 0.024619
mush_predictions <- predict(mush_classifier, mush_test)
agreement <- mush_predictions == mush_test$edibility
table(agreement)
## agreement
## FALSE  TRUE 
##    34  1591
prop.table(table(agreement))
## agreement
##      FALSE       TRUE 
## 0.02092308 0.97907692
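Raw agreement hides where the 34 misses fall, and here the direction of the error matters: calling a poisonous mushroom edible is far worse than the reverse. Since caret is already loaded, its confusionMatrix() gives the per-class breakdown; a sketch:

# hedged sketch: treating "p" (poisonous) as the positive class surfaces
# the dangerous poisonous-predicted-edible cases as false negatives
confusionMatrix(mush_predictions, mush_test$edibility, positive = "p")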

The polynomial kernel ("polydot") and the linear kernel produce identical results at 98% accuracy. That is no surprise: kernlab's polydot defaults to degree = 1 (see the hyperparameters below), which makes it the linear kernel plus a constant offset. Higher-degree polynomial kernels are often regarded as computationally expensive and prone to overfitting, so they are less commonly used. If the Gaussian kernel turned out not to be a better fit, we would stick with the linear kernel.

mush_classifier_poly <- ksvm(edibility ~ ., data = mush_train,
                             kernel = "polydot")
##  Setting default kernel parameters
print(mush_classifier_poly)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Polynomial kernel function. 
##  Hyperparameters : degree =  1  scale =  1  offset =  1 
## 
## Number of Support Vectors : 1722 
## 
## Objective Function Value : -944.8318 
## Training error : 0.024619
mush_predictions_poly <- predict(mush_classifier_poly, mush_test)
agreement_poly <- mush_predictions_poly == mush_test$edibility
table(agreement_poly)
## agreement_poly
## FALSE  TRUE 
##    34  1591
prop.table(table(agreement_poly))
## agreement_poly
##      FALSE       TRUE 
## 0.02092308 0.97907692
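For reference, a genuinely curved polynomial boundary requires degree > 1, passed through kpar; a sketch with illustrative, untuned values:

# hedged sketch: a quadratic kernel; degree/scale/offset are illustrative
mush_classifier_poly2 <- ksvm(edibility ~ ., data = mush_train,
                              kernel = "polydot",
                              kpar = list(degree = 2, scale = 1, offset = 1))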

The Gaussian kernel, however, reaches 100% test accuracy, so it is the best fit here. Also known as the radial basis function (RBF) kernel, it can draw nonlinear decision boundaries and therefore often matches or beats the linear kernel in accuracy. The popular iris dataset is an example where the linear kernel is preferred for the sake of speed, since the two tie in accuracy.
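Concretely, the Gaussian kernel scores a pair of points by their distance, k(x, y) = exp(-sigma * ||x - y||^2). A minimal sketch with made-up vectors shows kernlab's rbfdot agreeing with the hand-computed formula:

# minimal sketch: rbfdot versus the formula, with arbitrary inputs
x <- c(1, 2)
y <- c(2, 4)
sigma <- 0.05
rbf <- rbfdot(sigma = sigma)
rbf(x, y)                      # kernlab's kernel value
exp(-sigma * sum((x - y)^2))   # the same value by hand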

mush_classifier_rbf <- ksvm(edibility ~ ., data = mush_train,
                            kernel = "rbfdot")
print(mush_classifier_rbf)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.037623535746491 
## 
## Number of Support Vectors : 384 
## 
## Objective Function Value : -137.4535 
## Training error : 0
mush_predictions_rbf <- predict(mush_classifier_rbf, mush_test)
agreement_rbf <- mush_predictions_rbf == mush_test$edibility
table(agreement_rbf)
## agreement_rbf
## TRUE 
## 1625
prop.table(table(agreement_rbf))
## agreement_rbf
## TRUE 
##    1
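Perfect accuracy on a single random split can flatter a model, so a sturdier estimate is worth checking. ksvm's built-in cross argument runs k-fold cross-validation during fitting; a sketch:

# hedged sketch: 10-fold cross-validation error for the RBF model
mush_cv <- ksvm(edibility ~ ., data = mush_train,
                kernel = "rbfdot", cross = 10)
cross(mush_cv)   # extracts the cross-validation error estimate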