require(class)
## Loading required package: class
require(gmodels)
## Loading required package: gmodels
Heart <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"), header = FALSE)
  1. After pulling in the dataset I named the columns for the 14 attributes that are most commonly used, even though the full database lists 76. The documentation mentions that there are question marks in the "ca" and "thal" columns, so the patients with those missing values were removed. Since num is the output we want to predict, it is converted to a factor, and because we only want to know whether a patient has heart disease or not, we ignore the severity and use 0 = no disease and 1 = disease. This is done by recoding the severity values 2, 3, and 4 to 1. Finally, since ca and thal are predictors rather than the outcome, they must be converted from factors back to numeric variables (a short sketch after the str() output below shows why the as.character() step is needed).
head(Heart)
##   V1 V2 V3  V4  V5 V6 V7  V8 V9 V10 V11 V12 V13 V14
## 1 63  1  1 145 233  1  2 150  0 2.3   3 0.0 6.0   0
## 2 67  1  4 160 286  0  2 108  1 1.5   2 3.0 3.0   2
## 3 67  1  4 120 229  0  2 129  1 2.6   2 2.0 7.0   1
## 4 37  1  3 130 250  0  0 187  0 3.5   3 0.0 3.0   0
## 5 41  0  2 130 204  0  2 172  0 1.4   1 0.0 3.0   0
## 6 56  1  2 120 236  0  0 178  0 0.8   1 0.0 3.0   0
str(Heart)
## 'data.frame':    303 obs. of  14 variables:
##  $ V1 : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ V2 : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ V3 : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ V4 : num  145 160 120 130 130 120 140 120 130 140 ...
##  $ V5 : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ V6 : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ V7 : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ V8 : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ V9 : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ V10: num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ V11: num  3 2 2 3 1 1 3 1 2 3 ...
##  $ V12: Factor w/ 5 levels "?","0.0","1.0",..: 2 5 4 2 2 2 4 2 3 2 ...
##  $ V13: Factor w/ 4 levels "?","3.0","6.0",..: 3 2 4 2 2 2 2 2 4 4 ...
##  $ V14: int  0 2 1 0 0 0 3 0 2 1 ...
colnames(Heart) <- c("age","sex","cp","trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num")
head(Heart)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope  ca
## 1  63   1  1      145  233   1       2     150     0     2.3     3 0.0
## 2  67   1  4      160  286   0       2     108     1     1.5     2 3.0
## 3  67   1  4      120  229   0       2     129     1     2.6     2 2.0
## 4  37   1  3      130  250   0       0     187     0     3.5     3 0.0
## 5  41   0  2      130  204   0       2     172     0     1.4     1 0.0
## 6  56   1  2      120  236   0       0     178     0     0.8     1 0.0
##   thal num
## 1  6.0   0
## 2  3.0   2
## 3  7.0   1
## 4  3.0   0
## 5  3.0   0
## 6  3.0   0
class(Heart$num)
## [1] "integer"
Heart[Heart == "?"] <- NA
Heart <- na.omit(Heart)
sum(is.na(Heart))
## [1] 0
Heart$num[Heart$num=="4"] <- "1"
Heart$num[Heart$num=="3"] <- "1"
Heart$num[Heart$num=="2"] <- "1"
Heart$num <- as.factor(Heart$num)

class(Heart$num)
## [1] "factor"
head(Heart$num)
## [1] 0 1 1 0 0 0
## Levels: 0 1
round(prop.table(table(Heart$num)) * 100, digits = 1)
## 
##    0    1 
## 53.9 46.1
Heart$thal <- as.character(Heart$thal)
Heart$thal <- as.numeric(Heart$thal)
str(Heart)
## 'data.frame':    297 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 5 levels "?","0.0","1.0",..: 2 5 4 2 2 2 4 2 3 2 ...
##  $ thal    : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num     : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 2 1 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int  88 167 193 267 288 303
##   ..- attr(*, "names")= chr  "88" "167" "193" "267" ...
Heart$ca <- as.character(Heart$ca)
Heart$ca <- as.numeric(Heart$ca)
str(Heart)
## 'data.frame':    297 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num     : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 2 1 2 2 ...
##  - attr(*, "na.action")= 'omit' Named int  88 167 193 267 288 303
##   ..- attr(*, "names")= chr  "88" "167" "193" "267" ...
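Why convert with as.character() first? In R, calling as.numeric() directly on a factor returns the internal level codes rather than the printed labels. A minimal sketch with a toy factor (purely illustrative, not part of the dataset):

f <- factor(c("3.0", "6.0", "7.0"))
as.numeric(f)                # returns the level codes 1 2 3
as.numeric(as.character(f))  # returns the intended values 3 6 7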
  2. Before dividing the data we normalize it. A quick way to check that the normalization works correctly is to apply it to two vectors where one is exactly ten times the other; after normalizing, both produce the same result. This means it is safe to apply the function across the data frame and normalize all numeric features. We can then see, for example, that age now has a minimum of 0 and a maximum of 1.

The data is now ready to be split into training and test sets using a random 70/30 split (here 209 training and 88 test observations). After this is completed, we evaluate the model by comparing its predictions to the actual outcomes in the test data. With k = 1 the error rate is about 23%.

summary(Heart)
##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :1.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :3.000   Median :130.0  
##  Mean   :54.54   Mean   :0.6768   Mean   :3.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.0  
##  Median :243.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :247.4   Mean   :0.1448   Mean   :0.9966   Mean   :149.6  
##  3rd Qu.:276.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :2.000   Median :0.0000  
##  Mean   :0.3266   Mean   :1.056   Mean   :1.603   Mean   :0.6768  
##  3rd Qu.:1.0000   3rd Qu.:1.600   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :3.000   Max.   :3.0000  
##       thal       num    
##  Min.   :3.000   0:160  
##  1st Qu.:3.000   1:137  
##  Median :3.000          
##  Mean   :4.731          
##  3rd Qu.:7.000          
##  Max.   :7.000
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

normalize(c(1,2,3,4,5))
## [1] 0.00 0.25 0.50 0.75 1.00
normalize(c(10, 20, 30, 40, 50))
## [1] 0.00 0.25 0.50 0.75 1.00
Heart[1:13] <- as.data.frame(lapply(Heart[1:13], normalize))

summary(Heart$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3958  0.5625  0.5321  0.6667  1.0000
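To confirm the normalization worked on every predictor, not just age, a quick check can be run across all 13 columns (a minimal sketch; it assumes the predictor columns are all numeric, which they are after the conversions above):

sapply(Heart[1:13], range)   # each column should now run from 0 to 1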
set.seed(88)

ind <- sample(2, nrow(Heart), replace=TRUE, prob=c(0.7, 0.3))
train <- Heart[ind==1,]
test <- Heart[ind==2,]
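As a sanity check on the split, we can look at how many observations landed in each set and whether the class balance is similar (a small sketch using the train and test objects created above; exact counts depend on the random seed):

nrow(train); nrow(test)                                # roughly a 70/30 split of the 297 rows
round(prop.table(table(train$num)) * 100, digits = 1)  # class balance in the training set
round(prop.table(table(test$num)) * 100, digits = 1)   # class balance in the test set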
  3. Swapping in k values of 5, 14, and 10, we can try to find a better fit. With k = 5 the error rate is about 22%, with k = 14 it is about 20%, and with k = 10 it is about 19%. Of these, k = 10 performs best, but the model is still not accurate enough to be relied on for diagnosing heart disease. A small sketch at the end of this section shows how a range of k values could be compared in a loop.
pred <- knn(train = train[1:13], test = test[1:13], cl = train$num, k = 1)

CrossTable(x = test$num, y = pred, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  88 
## 
##  
##              | pred 
##     test$num |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        46 |         6 |        52 | 
##              |     0.885 |     0.115 |     0.591 | 
##              |     0.767 |     0.214 |           | 
##              |     0.523 |     0.068 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        14 |        22 |        36 | 
##              |     0.389 |     0.611 |     0.409 | 
##              |     0.233 |     0.786 |           | 
##              |     0.159 |     0.250 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        60 |        28 |        88 | 
##              |     0.682 |     0.318 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
(46 + 22) / (46 + 22 + 14 + 6)
## [1] 0.7727273
(14 + 6) / (46 + 22 + 14 + 6)
## [1] 0.2272727
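Instead of computing accuracy and error by hand from the cross table, the same numbers can be obtained directly from the prediction vector (a minimal sketch assuming pred and test are as defined above):

mean(pred == test$num)   # accuracy: proportion of test cases classified correctly
mean(pred != test$num)   # error rate: proportion misclassified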
pred_2 <- knn(train = train[1:13], test = test[1:13], cl = train$num, k = 5)
CrossTable(x = test$num, y = pred_2, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  88 
## 
##  
##              | pred_2 
##     test$num |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        46 |         6 |        52 | 
##              |     0.885 |     0.115 |     0.591 | 
##              |     0.780 |     0.207 |           | 
##              |     0.523 |     0.068 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        13 |        23 |        36 | 
##              |     0.361 |     0.639 |     0.409 | 
##              |     0.220 |     0.793 |           | 
##              |     0.148 |     0.261 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        59 |        29 |        88 | 
##              |     0.670 |     0.330 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
(46 + 23) / (46 + 23 + 13 + 6)
## [1] 0.7840909
(13 + 6) / (46 + 23 + 13 + 6)
## [1] 0.2159091
pred_3 <- knn(train = train[1:13], test = test[1:13], cl = train$num, k = 14)
CrossTable(x = test$num, y = pred_3, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  88 
## 
##  
##              | pred_3 
##     test$num |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        46 |         6 |        52 | 
##              |     0.885 |     0.115 |     0.591 | 
##              |     0.793 |     0.200 |           | 
##              |     0.523 |     0.068 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        12 |        24 |        36 | 
##              |     0.333 |     0.667 |     0.409 | 
##              |     0.207 |     0.800 |           | 
##              |     0.136 |     0.273 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        58 |        30 |        88 | 
##              |     0.659 |     0.341 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
(46 + 24) / (46 + 24 + 12 + 6)
## [1] 0.7954545
(12 + 6) / (46 + 24 + 12 + 6)
## [1] 0.2045455
pred_3 <- knn(train = train[1:13], test = test[1:13], cl = train$num, k = 10)
CrossTable(x = test$num, y = pred_3, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  88 
## 
##  
##              | pred_3 
##     test$num |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        47 |         5 |        52 | 
##              |     0.904 |     0.096 |     0.591 | 
##              |     0.797 |     0.172 |           | 
##              |     0.534 |     0.057 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        12 |        24 |        36 | 
##              |     0.333 |     0.667 |     0.409 | 
##              |     0.203 |     0.828 |           | 
##              |     0.136 |     0.273 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        59 |        29 |        88 | 
##              |     0.670 |     0.330 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
(47 + 24) / (47 + 24 + 5 + 12)
## [1] 0.8068182
(5 + 12) / (47 + 24 + 5 + 12)
## [1] 0.1931818
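Rather than trying individual k values one at a time, the comparison can be automated by looping over a range of candidates and recording the error rate for each (a sketch under the same train/test split; knn() breaks ties at random, so the exact numbers may vary slightly between runs):

k_values <- 1:20
error_rates <- sapply(k_values, function(k) {
  pred_k <- knn(train = train[1:13], test = test[1:13], cl = train$num, k = k)
  mean(pred_k != test$num)   # misclassification rate on the test set
})
data.frame(k = k_values, error = round(error_rates, 3))
k_values[which.min(error_rates)]   # k with the lowest test error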