## Loading required package: tidyverse
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## -- Attaching packages -------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts ----------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: cluster
## Loading required package: factoextra
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: animation
## Loading required package: RColorBrewer
## Loading required package: dendextend
## 
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
##   Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1       2      3 12669 9656    7561    214             2674       1338
## 2       2      3  7057 9810    9568   1762             3293       1776
## 3       2      3  6353 8808    7684   2405             3516       7844
## 4       1      3 13265 1196    4221   6404              507       1788
## 5       2      3 22615 5410    7198   3915             1777       5185
## 6       2      3  9413 8259    5126    666             1795       1451

I made a box plot to check for outliers. Since there are some, I reviewed histograms of the individual variables and then set upper thresholds to leave the outliers out. I also made sure to omit any missing values and dropped the columns “Channel” and “Region” since they don’t contribute much. I then used scale() to standardize the data frame, so that each variable has a mean of zero and a standard deviation of one. Finally, I computed the correlation matrix and plotted a distance matrix of it, using Euclidean distance, to check out the correlation between variables.

# Show the column types and first few values of the raw data.
str(object = customers)
## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
# Exploratory plots on the raw data: one boxplot across all columns, then a
# histogram for each spending variable.
boxplot(customers)

# main/xlab are spelled out so every histogram keeps the same title and axis
# label that a direct hist(customers$<column>) call would generate.
invisible(lapply(
  c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen"),
  function(column) {
    label <- paste0("customers$", column)
    hist(customers[[column]], main = paste("Histogram of", label), xlab = label)
  }
))

# Remove outliers: keep only rows that fall below an upper cap on every
# spending variable. subset() also drops rows where the condition is NA.
# Channel and Region are deliberately not part of the condition: they are
# integer codes, so using them as bare logical terms would only test
# "non-zero" and (presumably, given the 1/2/3 codes in the data) filter nothing.
customers2 <- subset(
  customers,
  Fresh < 30000 & Milk < 20000 & Grocery < 20000 & Frozen < 5000 &
    Detergents_Paper < 10000 & Delicassen < 3000
)

# Drop the two categorical columns by name rather than by position, so a
# change in column order cannot silently remove the wrong variables.
customers3 <- customers2[setdiff(names(customers2), c("Channel", "Region"))]

# Discard any remaining rows that contain a missing value.
customers3 <- na.omit(customers3)

# Center each column to mean 0 and scale it to unit standard deviation;
# scale() returns a numeric matrix.
customers4 <- scale(customers3)
summary(customers4)
##      Fresh              Milk            Grocery            Frozen       
##  Min.   :-1.2220   Min.   :-1.2000   Min.   :-1.2034   Min.   :-1.2248  
##  1st Qu.:-0.8335   1st Qu.:-0.8213   1st Qu.:-0.7793   1st Qu.:-0.7903  
##  Median :-0.2245   Median :-0.2815   Median :-0.3969   Median :-0.3169  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6044   3rd Qu.: 0.6582   3rd Qu.: 0.6671   3rd Qu.: 0.6532  
##  Max.   : 2.8139   Max.   : 3.5664   Max.   : 2.9895   Max.   : 2.6749  
##  Detergents_Paper    Delicassen     
##  Min.   :-0.8354   Min.   :-1.2938  
##  1st Qu.:-0.7375   1st Qu.:-0.7963  
##  Median :-0.5201   Median :-0.2981  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6132   3rd Qu.: 0.6326  
##  Max.   : 3.3708   Max.   : 2.8691
# Boxplot of the standardized columns.
boxplot(customers4)

# Pairwise Pearson correlations between the standardized variables.
customers_cor <- cor(x = customers4, method = "pearson")
customers_cor
##                        Fresh       Milk     Grocery      Frozen
## Fresh             1.00000000 -0.1253835 -0.07145758  0.25694093
## Milk             -0.12538349  1.0000000  0.76039355 -0.16488160
## Grocery          -0.07145758  0.7603936  1.00000000 -0.16682531
## Frozen            0.25694093 -0.1648816 -0.16682531  1.00000000
## Detergents_Paper -0.19251717  0.7145396  0.85841788 -0.15581614
## Delicassen        0.15628415  0.2682731  0.27976701  0.09886296
##                  Detergents_Paper Delicassen
## Fresh                  -0.1925172 0.15628415
## Milk                    0.7145396 0.26827313
## Grocery                 0.8584179 0.27976701
## Frozen                 -0.1558161 0.09886296
## Detergents_Paper        1.0000000 0.16623513
## Delicassen              0.1662351 1.00000000
# Euclidean distances between the rows of the correlation matrix, drawn as a
# heat map.
# NOTE(review): this compares the variables' correlation profiles, not the
# customers themselves; pass customers4 instead if an observation-level
# distance map was intended -- confirm.
distance <- get_dist(x = customers_cor, method = "euclidean")
fviz_dist(
  dist.obj = distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)

R's kmeans() uses 10 as the default value for the maximum number of iterations (iter.max). An nstart of 25 is recommended; this is the number of random initial configurations that are tried, of which the best one is kept. According to the elbow method, 4 looks to be the optimal number of clusters, while 2 maximizes the average silhouette width under the average silhouette method. However, the gap statistic recommended 10 clusters.

# Fix the RNG seed so the random starting centers are reproducible.
set.seed(34)

# k-means with two clusters on the standardized data, using 25 random starts.
k2 <- kmeans(x = customers4, centers = 2, nstart = 25)
str(object = k2)
## List of 9
##  $ cluster     : Named int [1:284] 2 2 2 1 2 1 2 2 1 2 ...
##   ..- attr(*, "names")= chr [1:284] "1" "2" "6" "7" ...
##  $ centers     : num [1:2, 1:6] 0.0872 -0.1975 -0.5173 1.1714 -0.5453 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:6] "Fresh" "Milk" "Grocery" "Frozen" ...
##  $ totss       : num 1698
##  $ withinss    : num [1:2] 679 431
##  $ tot.withinss: num 1111
##  $ betweenss   : num 587
##  $ size        : int [1:2] 197 87
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Plot the two-cluster solution against the data it was fitted on.
fviz_cluster(object = k2, data = customers4)