# Load modeling packages. library() errors immediately if a package is
# missing, whereas require() merely returns FALSE and lets the script
# continue until a later, more confusing failure.
library(caret)          # confusionMatrix()
library(rpart)          # decision trees
library(rpart.plot)     # tree visualization
library(randomForest)   # random forest classifier

# Fix the RNG seed so the train/test sampling below is reproducible.
set.seed(1)

# Structure of the red-wine data: 11 numeric physico-chemical predictors
# plus the integer "quality" score we want to predict.
str(wine)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

First we convert the response variable “quality” to a factor. Before the data is split it is important to note that there are no NA values. The integer ratings in the data frame range from 3 to 8, so in the histogram (drawn on the factor’s integer codes) the bars at levels 3 and 4 correspond to ratings 5 and 6 in the data frame. Using the decision tree classification method, the wine is classified into six levels based on its properties. Because the samples are drawn at random it is important to set the seed; we then split the data into 80% for training and 20% for testing to create two randomized samples of the data.

In this model “no” branches always go to the right and “yes” branches always to the left, with each branch representing a decision that splits the data into a new classification. The decision tree assigned observations to only 3 of the 6 available classifications — 5, 6, and 7 — as indicated by the “unused” 3, 4, and 8 in the key. The furthest branches show that the model makes a substantial number of errors, and when tested on unseen data the predictions were only 57% accurate, which isn’t very good.

# Distribution of the quality ratings: heavily concentrated at 5 and 6.
table(wine$quality)
##
##   3   4   5   6   7   8
##  10  53 681 638 199  18

names(wine)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide"
##  [7] "total.sulfur.dioxide" "density"              "pH"
## [10] "sulphates"            "alcohol"              "quality"

# No missing values anywhere in the data frame, so nothing to impute.
sum(is.na(wine))
## [1] 0

# rpart's "class" method requires a factor response, so recode quality.
wine$quality <- as.factor(wine$quality)
str(wine$quality)
##  Factor w/ 6 levels "3","4","5","6",..: 3 3 3 4 3 3 3 5 5 3 ...
# Histogram of the class balance on the factor's integer codes
# (code 3 = quality 5, code 4 = quality 6, etc.).
mixture <- as.numeric(wine$quality)
hist(mixture)

# 80/20 train/test split: 80% of 1599 rows, rounded down to 1279.
.8 * 1599
## [1] 1279.2
s <- sample(1599, 1279)
wine_train <- wine[s, ]
wine_test <- wine[-s, ]
dim(wine_train)
## [1] 1279   12
dim(wine_test)
## [1] 320  12

# Fit a classification tree on all predictors and plot it two ways.
tm <- rpart(quality~., wine_train, method = "class")
rpart.plot(tm, tweak = .9)
rpart.plot(tm, type = 4, extra = 101, tweak = .9)

# Predict on the held-out 20% and cross-tabulate truth vs. prediction.
pred <- predict(tm, wine_test, type = "class")
table(wine_test$quality, pred)
##    pred
##      3  4  5  6  7  8
##   3  0  0  3  1  0  0
##   4  0  0  9  2  0  0
##   5  0  0 90 34  1  0
##   6  0  0 48 73 14  0
##   7  0  0  6 17 19  0
##   8  0  0  0  1  2  0
# Full performance summary of the tree on the test set. Note the tree
# never predicts classes 3, 4, or 8 at all.
confusionMatrix(table(pred, wine_test$quality))
## Confusion Matrix and Statistics
##
##
## pred   3  4  5  6  7  8
##    3   0  0  0  0  0  0
##    4   0  0  0  0  0  0
##    5   3  9 90 48  6  0
##    6   1  2 34 73 17  1
##    7   0  0  1 14 19  2
##    8   0  0  0  0  0  0
##
## Overall Statistics
##
##                Accuracy : 0.5688
##                  95% CI : (0.5125, 0.6237)
##     No Information Rate : 0.4219
##     P-Value [Acc > NIR] : 8.847e-08
##
##                   Kappa : 0.3112
##
##  Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity            0.0000  0.00000   0.7200   0.5407  0.45238 0.000000
## Specificity            1.0000  1.00000   0.6615   0.7027  0.93885 1.000000
## Pos Pred Value            NaN      NaN   0.5769   0.5703  0.52778      NaN
## Neg Pred Value         0.9875  0.96562   0.7866   0.6771  0.91901 0.990625
## Prevalence             0.0125  0.03438   0.3906   0.4219  0.13125 0.009375
## Detection Rate         0.0000  0.00000   0.2812   0.2281  0.05937 0.000000
## Detection Prevalence   0.0000  0.00000   0.4875   0.4000  0.11250 0.000000
## Balanced Accuracy      0.5000  0.50000   0.6908   0.6217  0.69561 0.500000

To help improve the power of the model we reduce the levels of classification from 6 to 3. Wines rated 7 and 8 become “good”, 5 and 6 become “normal”, and 3 and 4 become “bad”. After splitting — keeping 80% for training — we can now test on the remaining data, which shows 88% accuracy and is a lot better than the decision tree.

# Re-read the raw data so "quality" is an integer again before binning
# it into three taste categories.
wine2 <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"), header = TRUE, sep = ";")
set.seed(1)
barplot(table(wine2$quality))

# Collapse the six quality scores into three taste categories:
#   quality < 5   -> "bad"
#   quality 5, 6  -> "normal"
#   quality 7, 8  -> "good"
# Start with a bad/good split, then carve out the middle scores.
wine2$taste <- ifelse(wine2$quality < 5, "bad", "good")
wine2$taste[wine2$quality %in% c(5, 6)] <- "normal"
# Factor levels come out alphabetical: bad, good, normal.
wine2$taste <- as.factor(wine2$taste)

str(wine2$taste)
## Factor w/ 3 levels "bad","good","normal": 3 3 3 3 3 3 3 2 2 3 ...
barplot(table(wine2$taste))

# Class balance after binning: "normal" dominates.
table(wine2$taste)
##
##    bad   good normal
##     63    217   1319

# Same 80/20 split on the three-class data (13 columns: 12 + taste).
samp <- sample(1599, 1279)
wine_train2 <- wine2[samp, ]
wine_test2 <- wine2[-samp, ]
dim(wine_train2)
## [1] 1279   13
dim(wine_test2)
## [1] 320  13

# Random forest on every predictor EXCEPT the original quality score,
# which would leak the answer (taste is derived from quality).
model <- randomForest(taste ~ . - quality, data = wine_train2)
model
##
## Call:
##  randomForest(formula = taste ~ . - quality, data = wine_train2)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
##
##         OOB estimate of  error rate: 12.28%
## Confusion matrix:
##        bad good normal class.error
## bad      0    1     47  1.00000000
## good     0   88     84  0.48837209
## normal   0   25   1034  0.02360718

# Predict taste for the held-out rows and cross-tabulate.
prediction <- predict(model, newdata = wine_test2)
table(prediction, wine_test2$taste)
##
##     normal  15   20    255
## NOTE(review): the table output above was truncated in the original
## document — only the "normal" row survived the paste.
# Hold-out accuracy: correctly classified test rows / total test rows.
# Computed from the confusion-matrix diagonal rather than hard-coded
# counts (originally (0 + 25 + 255) / 320), so the figure stays correct
# if the split or the model changes.
conf <- table(prediction, wine_test2$taste)
sum(diag(conf)) / nrow(wine_test2)
## [1] 0.875