require(h2oEnsemble)
## Loading required package: h2oEnsemble
## Loading required package: h2o
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
## h2oEnsemble R package for H2O-3
## Version: 0.2.1
## Package created on 2017-08-02
require(h2o)
require(rio)
## Loading required package: rio
require(doParallel)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
require(viridis)
## Loading required package: viridis
## Loading required package: viridisLite
require(RColorBrewer)
## Loading required package: RColorBrewer
require(ggthemes)
## Loading required package: ggthemes
require(plotly)
## Loading required package: plotly
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:rio':
## 
##     export
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
require(lime)
## Loading required package: lime
require(plotROC)
## Loading required package: plotROC
require(pROC)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:plotROC':
## 
##     ggroc
## The following object is masked from 'package:h2o':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
require(bit64)
## Loading required package: bit64
## Loading required package: bit
## Attaching package bit
## package:bit (c) 2008-2012 Jens Oehlschlaegel (GPL-2)
## creators: bit bitwhich
## coercion: as.logical as.integer as.bit as.bitwhich which
## operator: ! & | xor != ==
## querying: print length any all min max range sum summary
## bit access: length<- [ [<- [[ [[<-
## for more help type ?bit
## 
## Attaching package: 'bit'
## The following object is masked from 'package:base':
## 
##     xor
## Attaching package bit64
## package:bit64 (c) 2011-2012 Jens Oehlschlaegel
## creators: integer64 seq :
## coercion: as.integer64 as.vector as.logical as.integer as.double as.character as.bin
## logical operator: ! & | xor != == < <= >= >
## arithmetic operator: + - * / %/% %% ^
## math: sign abs sqrt log log2 log10
## math: floor ceiling trunc round
## querying: is.integer64 is.vector [is.atomic} [length] format print str
## values: is.na is.nan is.finite is.infinite
## aggregation: any all min max range sum prod
## cumulation: diff cummin cummax cumsum cumprod
## access: length<- [ [<- [[ [[<-
## combine: c rep cbind rbind as.data.frame
## WARNING don't use as subscripts
## WARNING semantics differ from integer
## for more help type ?bit64
## 
## Attaching package: 'bit64'
## The following object is masked from 'package:bit':
## 
##     still.identical
## The following object is masked from 'package:h2o':
## 
##     %in%
## The following objects are masked from 'package:base':
## 
##     %in%, :, is.double, match, order, rank
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:lime':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the flight on-time extract. Field definitions are taken from the BTS
# on-time performance table:
#   https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236
# Per that source, flights that depart early carry negative departure values,
# and the cancelled / diverted flight indicators are coded 1 = yes.
# NOTE(review): the absolute Windows path ties this script to one machine.
delay <- read.csv(
  file = "C:/Users/12488/Documents/R/delay.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Drop two columns so they are not carried into modelling.
# NOTE(review): X is presumably a row-index column written by an earlier
# export -- confirm against the CSV.
delay[["X"]] <- NULL
delay[["DAY_OF_WEEK"]] <- NULL

# Treat the day of the month as a categorical label rather than a number.
delay[["DAY_OF_MONTH"]] <- as.factor(delay[["DAY_OF_MONTH"]])

# Inspect column types after cleaning.
str(delay)
## 'data.frame':    583985 obs. of  16 variables:
##  $ DAY_OF_MONTH     : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ OP_CARRIER       : Factor w/ 17 levels "9E","AA","AS",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAIL_NUM         : Factor w/ 5447 levels "","215NV","216NV",..: 4321 1401 4477 4473 4534 4879 4753 1096 1337 1184 ...
##  $ OP_CARRIER_FL_NUM: int  3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 ...
##  $ ORIGIN_AIRPORT_ID: int  11953 13487 11433 15249 10397 11267 12448 12953 12451 10397 ...
##  $ ORIGIN           : Factor w/ 346 levels "ABE","ABI","ABQ",..: 129 226 96 326 20 86 169 190 170 20 ...
##  $ DEST_AIRPORT_ID  : int  10397 11193 11193 10397 11778 13487 10397 11193 12953 10685 ...
##  $ DEST             : Factor w/ 346 levels "ABE","ABI","ABQ",..: 20 81 81 20 121 226 20 81 190 39 ...
##  $ DEP_TIME         : int  601 1359 1215 1521 1847 853 1553 1551 1037 1245 ...
##  $ DEP_DEL15        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DEP_TIME_BLK     : Factor w/ 19 levels "0001-0559","0600-0659",..: 2 10 8 11 15 5 11 11 6 8 ...
##  $ ARR_TIME         : int  722 1633 1329 1625 1940 953 1832 1824 1239 1318 ...
##  $ ARR_DEL15        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLED        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIVERTED         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DISTANCE         : num  300 596 229 223 579 574 341 585 833 533 ...
# Keep complete cases only: na.omit() removes every row that has an NA in any
# column and records the dropped row indices in the "na.action" attribute.
delay2 <- na.omit(delay)

# Confirm the remaining row count and column types.
str(delay2)
## 'data.frame':    565963 obs. of  16 variables:
##  $ DAY_OF_MONTH     : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ OP_CARRIER       : Factor w/ 17 levels "9E","AA","AS",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAIL_NUM         : Factor w/ 5447 levels "","215NV","216NV",..: 4321 1401 4477 4473 4534 4879 4753 1096 1337 1184 ...
##  $ OP_CARRIER_FL_NUM: int  3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 ...
##  $ ORIGIN_AIRPORT_ID: int  11953 13487 11433 15249 10397 11267 12448 12953 12451 10397 ...
##  $ ORIGIN           : Factor w/ 346 levels "ABE","ABI","ABQ",..: 129 226 96 326 20 86 169 190 170 20 ...
##  $ DEST_AIRPORT_ID  : int  10397 11193 11193 10397 11778 13487 10397 11193 12953 10685 ...
##  $ DEST             : Factor w/ 346 levels "ABE","ABI","ABQ",..: 20 81 81 20 121 226 20 81 190 39 ...
##  $ DEP_TIME         : int  601 1359 1215 1521 1847 853 1553 1551 1037 1245 ...
##  $ DEP_DEL15        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DEP_TIME_BLK     : Factor w/ 19 levels "0001-0559","0600-0659",..: 2 10 8 11 15 5 11 11 6 8 ...
##  $ ARR_TIME         : int  722 1633 1329 1625 1940 953 1832 1824 1239 1318 ...
##  $ ARR_DEL15        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CANCELLED        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIVERTED         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DISTANCE         : num  300 596 229 223 579 574 341 585 833 533 ...
##  - attr(*, "na.action")= 'omit' Named int [1:18022] 398 404 972 1251 1634 1867 1869 1880 1974 1976 ...
##   ..- attr(*, "names")= chr [1:18022] "398" "404" "972" "1251" ...
# Draw the working sample used for modelling.
#
# delay3, delay4 and delay5 are all kept as separate objects because later
# code refers to delay5 (and other parts of the analysis may refer to the
# intermediate names).
delay3 <- as.data.frame(delay2)

# Seed the RNG *before* sampling. Previously the first set.seed() call came
# after this point, so a different 100,000 rows were drawn on every run and
# no downstream result could be reproduced. 55 matches the seed used for the
# H2O split and AutoML run.
set.seed(55)

# Cap the sample size at the number of available rows: sample_n() without
# replacement errors when asked for more rows than the data frame holds.
delay4 <- sample_n(delay3, min(100000, nrow(delay3)))

# Defensive no-op here: delay4 is drawn from delay2, which is already free of
# NAs, so no rows are removed.
delay5 <- na.omit(delay4)

# Inspect the sampled data.
str(delay5)
## 'data.frame':    100000 obs. of  16 variables:
##  $ DAY_OF_MONTH     : Factor w/ 31 levels "1","2","3","4",..: 8 1 25 4 22 27 17 13 3 9 ...
##  $ OP_CARRIER       : Factor w/ 17 levels "9E","AA","AS",..: 2 2 2 16 5 15 2 11 4 2 ...
##  $ TAIL_NUM         : Factor w/ 5447 levels "","215NV","216NV",..: 5441 5153 5304 4947 1474 1879 3993 2577 3664 3248 ...
##  $ OP_CARRIER_FL_NUM: int  1401 359 686 5943 1878 3996 1932 557 10 583 ...
##  $ ORIGIN_AIRPORT_ID: int  12889 10721 14107 13158 11953 10800 15016 10397 12954 11057 ...
##  $ ORIGIN           : Factor w/ 346 levels "ABE","ABI","ABQ",..: 179 42 252 200 129 53 317 20 191 69 ...
##  $ DEST_AIRPORT_ID  : int  11298 11298 14771 11298 10397 12889 11057 15304 10849 14524 ...
##  $ DEST             : Factor w/ 346 levels "ABE","ABI","ABQ",..: 90 90 298 90 20 179 69 328 55 277 ...
##  $ DEP_TIME         : int  1257 553 1845 1651 725 1021 1502 1427 1044 1356 ...
##  $ DEP_DEL15        : num  0 1 1 1 0 0 0 1 0 0 ...
##  $ DEP_TIME_BLK     : Factor w/ 19 levels "0001-0559","0600-0659",..: 8 1 14 12 3 6 11 8 6 10 ...
##  $ ARR_TIME         : int  1742 945 2013 1803 847 1122 1741 1614 1409 1453 ...
##  $ ARR_DEL15        : num  0 1 1 0 0 0 0 1 0 0 ...
##  $ CANCELLED        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIVERTED         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DISTANCE         : num  1055 1562 651 309 300 ...
##  - attr(*, "na.action")= 'omit' Named int [1:18022] 398 404 972 1251 1634 1867 1869 1880 1974 1976 ...
##   ..- attr(*, "names")= chr [1:18022] "398" "404" "972" "1251" ...
# Move the response column, ARR_DEL15, to the last position: dropping it and
# then re-selecting it appends it after all remaining columns.
delay6 <- select(delay5, -ARR_DEL15, ARR_DEL15)

# (disabled) keep only the first eight columns:
# delay6 <- delay6[, 1:8]

# Confirm the new column order.
str(delay6)
## 'data.frame':    100000 obs. of  16 variables:
##  $ DAY_OF_MONTH     : Factor w/ 31 levels "1","2","3","4",..: 8 1 25 4 22 27 17 13 3 9 ...
##  $ OP_CARRIER       : Factor w/ 17 levels "9E","AA","AS",..: 2 2 2 16 5 15 2 11 4 2 ...
##  $ TAIL_NUM         : Factor w/ 5447 levels "","215NV","216NV",..: 5441 5153 5304 4947 1474 1879 3993 2577 3664 3248 ...
##  $ OP_CARRIER_FL_NUM: int  1401 359 686 5943 1878 3996 1932 557 10 583 ...
##  $ ORIGIN_AIRPORT_ID: int  12889 10721 14107 13158 11953 10800 15016 10397 12954 11057 ...
##  $ ORIGIN           : Factor w/ 346 levels "ABE","ABI","ABQ",..: 179 42 252 200 129 53 317 20 191 69 ...
##  $ DEST_AIRPORT_ID  : int  11298 11298 14771 11298 10397 12889 11057 15304 10849 14524 ...
##  $ DEST             : Factor w/ 346 levels "ABE","ABI","ABQ",..: 90 90 298 90 20 179 69 328 55 277 ...
##  $ DEP_TIME         : int  1257 553 1845 1651 725 1021 1502 1427 1044 1356 ...
##  $ DEP_DEL15        : num  0 1 1 1 0 0 0 1 0 0 ...
##  $ DEP_TIME_BLK     : Factor w/ 19 levels "0001-0559","0600-0659",..: 8 1 14 12 3 6 11 8 6 10 ...
##  $ ARR_TIME         : int  1742 945 2013 1803 847 1122 1741 1614 1409 1453 ...
##  $ CANCELLED        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DIVERTED         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DISTANCE         : num  1055 1562 651 309 300 ...
##  $ ARR_DEL15        : num  0 1 1 0 0 0 0 1 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:18022] 398 404 972 1251 1634 1867 1869 1880 1974 1976 ...
##   ..- attr(*, "names")= chr [1:18022] "398" "404" "972" "1251" ...
# Connect to the H2O cluster on the default local port.
# nthreads = -1 requests all available cores; max_mem_size requests a 4 GB
# heap.
# NOTE(review): these sizing arguments presumably only take effect when R
# launches a new cluster, not when it attaches to one that is already
# running -- confirm against the h2o.init() documentation.
h2o.init(
  port = 54321,
  nthreads = -1,
  max_mem_size = "4g"
)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 5 minutes 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    22 days  
##     H2O cluster name:           H2O_started_from_R_12488_fhx057 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.08 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 4.0.0 (2020-04-24)
# Start from a clean cluster and silence progress bars.
# NOTE(review): h2o.removeAll() deletes every object held by the connected
# cluster, not just objects created by this script.
h2o.removeAll()
h2o.no_progress()

set.seed(55)

# Push the prepared data frame to the H2O cluster.
delay_h2o <- as.h2o(delay6)

# Response column, and every other column as a predictor.
y <- "ARR_DEL15"
x <- setdiff(names(delay_h2o), y)

# The response is a 0/1 indicator; convert it to a factor so that H2O treats
# the task as classification rather than regression.
delay_h2o[, y] <- as.factor(delay_h2o[, y])

# 70% / 15% / remaining ~15% split; the final share is whatever the two
# ratios leave over.
split_df <- h2o.splitFrame(
  data = delay_h2o,
  ratios = c(0.7, 0.15),
  seed = 55
)

# Training, validation and test frames, in the order returned by the split.
h2o_train <- split_df[[1]]
h2o_valid <- split_df[[2]]
h2o_test <- split_df[[3]]
# Run H2O AutoML for at most 300 seconds on the training frame.
#
# Model building and tuning rely on AutoML's built-in cross-validation of
# training_frame. H2O itself reports that a validation_frame supplied while
# cross-validation is enabled only yields informative metrics, so that
# argument is not passed.
#
# The leaderboard is ranked on h2o_valid rather than h2o_test. Ranking on the
# test frame means the leader is selected on the very data it is later scored
# on, which makes the reported test metrics optimistic. With this change
# h2o_test stays untouched until the final h2o.performance() call.
automl_model <- h2o.automl(
  x = x,
  y = y,
  training_frame = h2o_train,
  leaderboard_frame = h2o_valid,
  max_runtime_secs = 300,
  seed = 55
)
## 
## 18:50:21.867: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
## 18:50:21.868: AutoML: XGBoost is not available; skipping it.
require(tidyverse)
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.0     v purrr   0.3.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate() masks foreach::accumulate()
## x dplyr::explain()    masks lime::explain()
## x dplyr::filter()     masks plotly::filter(), stats::filter()
## x dplyr::lag()        masks stats::lag()
## x purrr::when()       masks foreach::when()
# Pull the AutoML leaderboard and print every row rather than the default
# truncated view. Its metrics are computed on the leaderboard_frame that was
# passed to h2o.automl().
lb <- automl_model@leaderboard
print(lb, n = nrow(lb))
##                                               model_id       auc   logloss
## 1           GBM_grid__1_AutoML_20200426_185021_model_3 0.9204506 0.2430895
## 2                         GBM_5_AutoML_20200426_185021 0.9202112 0.2381429
## 3  StackedEnsemble_BestOfFamily_AutoML_20200426_185021 0.9196326 0.2442572
## 4     StackedEnsemble_AllModels_AutoML_20200426_185021 0.9190143 0.2376983
## 5           GBM_grid__1_AutoML_20200426_185021_model_1 0.9187970 0.2434613
## 6                         GBM_3_AutoML_20200426_185021 0.9131283 0.2449401
## 7                         GBM_2_AutoML_20200426_185021 0.9127558 0.2452659
## 8                         GBM_4_AutoML_20200426_185021 0.9127346 0.2489269
## 9           GBM_grid__1_AutoML_20200426_185021_model_2 0.9115317 0.2557099
## 10                        GBM_1_AutoML_20200426_185021 0.9068020 0.2490369
## 11                        DRF_1_AutoML_20200426_185021 0.9010693 0.4437607
## 12               DeepLearning_1_AutoML_20200426_185021 0.8854885 0.2629767
## 13                        GLM_1_AutoML_20200426_185021 0.8818243 0.2624517
## 14          GBM_grid__1_AutoML_20200426_185021_model_4 0.8811591 0.3827688
## 15                        XRT_1_AutoML_20200426_185021 0.8764819 0.2728028
##        aucpr mean_per_class_error      rmse        mse
## 1  0.8241781            0.1501286 0.2578296 0.06647610
## 2  0.8149209            0.1445857 0.2575025 0.06630754
## 3  0.8223374            0.1456966 0.2583676 0.06675380
## 4  0.8287731            0.1422825 0.2555598 0.06531083
## 5  0.8139199            0.1482764 0.2586049 0.06687652
## 6  0.7978315            0.1474510 0.2605903 0.06790730
## 7  0.8013158            0.1500853 0.2609932 0.06811744
## 8  0.7968766            0.1475644 0.2621877 0.06874238
## 9  0.7910021            0.1503709 0.2659621 0.07073584
## 10 0.7893983            0.1466123 0.2622663 0.06878361
## 11 0.7780632            0.1481085 0.2650750 0.07026474
## 12 0.7509601            0.1490653 0.2675034 0.07155808
## 13 0.7515984            0.1486187 0.2669214 0.07124703
## 14 0.7507041            0.2008648 0.3417325 0.11678111
## 15 0.7460681            0.1482038 0.2685477 0.07211787
## 
## [15 rows x 7 columns]
# Score the top-ranked AutoML model on the h2o_test frame; the metrics object
# is auto-printed.
# NOTE(review): this is only an unbiased holdout estimate if h2o_test was not
# also used to rank the models (leaderboard_frame) -- confirm against the
# h2o.automl() call.
h2o.performance(
  model = automl_model@leader,
  newdata = h2o_test
)
## H2OBinomialMetrics: gbm
## 
## MSE:  0.0664761
## RMSE:  0.2578296
## LogLoss:  0.2430895
## Mean Per-Class Error:  0.1501286
## AUC:  0.9204506
## AUCPR:  0.8241781
## Gini:  0.8409012
## R^2:  0.5601176
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##            0    1    Error         Rate
## 0      11762  493 0.040228   =493/12255
## 1        726 2066 0.260029    =726/2792
## Totals 12488 2559 0.081013  =1219/15047
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold        value idx
## 1                       max f1  0.320772     0.772192 219
## 2                       max f2  0.087440     0.781900 295
## 3                 max f0point5  0.608949     0.811914 144
## 4                 max accuracy  0.425626     0.919652 194
## 5                max precision  0.992636     1.000000   0
## 6                   max recall  0.003036     1.000000 398
## 7              max specificity  0.992636     1.000000   0
## 8             max absolute_mcc  0.320772     0.724055 219
## 9   max min_per_class_accuracy  0.062551     0.853868 313
## 10 max mean_per_class_accuracy  0.114027     0.862402 282
## 11                     max tns  0.992636 12255.000000   0
## 12                     max fns  0.992636  2788.000000   0
## 13                     max fps  0.002254 12255.000000 399
## 14                     max tps  0.003036  2792.000000 398
## 15                     max tnr  0.992636     1.000000   0
## 16                     max fnr  0.992636     0.998567   0
## 17                     max fpr  0.002254     1.000000 399
## 18                     max tpr  0.003036     1.000000 398
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`