## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(modelplotr)
library(kableExtra)

## ----pressure, echo=FALSE, fig.cap="Cartoon ROC plot", out.width = '100%'-----
knitr::include_graphics("https://modelplot.github.io/img/cartoonrocplot.jpg")

## ----decileplot, echo=FALSE, out.width = '100%'-------------------------------
knitr::include_graphics("https://modelplot.github.io/img/decileplot.png")

## ----cumgainsplot, echo=FALSE, out.width = '100%'-----------------------------
knitr::include_graphics("https://modelplot.github.io/img/cumgainsplot.png")

## ----cumliftplot, echo=FALSE, out.width = '100%'------------------------------
knitr::include_graphics("https://modelplot.github.io/img/cumliftplot.png")

## ----responseplot, echo=FALSE, out.width = '100%'-----------------------------
knitr::include_graphics("https://modelplot.github.io/img/responseplot.png")

## ----cumresponseplot, echo=FALSE, out.width = '100%'--------------------------
knitr::include_graphics("https://modelplot.github.io/img/cumresponseplot.png")

## ----costsrevsplot, echo=FALSE, out.width = '100%'----------------------------
knitr::include_graphics("https://modelplot.github.io/img/costsrevsplot.png")

## ----profitplot, echo=FALSE, out.width = '100%'-------------------------------
knitr::include_graphics("https://modelplot.github.io/img/profitplot.png")

## ----roiplot, echo=FALSE, out.width = '100%'----------------------------------
knitr::include_graphics("https://modelplot.github.io/img/roiplot.png")

## ----loaddata, echo=TRUE------------------------------------------------------
# load example data (Bank clients that have/have not subscribed a term deposit - see ?bank_td for details)
data("bank_td")

str(bank_td)

## ----trainmodels, echo=TRUE---------------------------------------------------
# prepare data for training model for binomial target has_td and train models
train_index =  base::sample(seq(1, nrow(bank_td)),size = 0.5*nrow(bank_td) ,replace = FALSE)
train = bank_td[train_index,c('has_td','duration','campaign','pdays','previous','euribor3m')]
test = bank_td[-train_index,c('has_td','duration','campaign','pdays','previous','euribor3m')]

#train models using mlr...
trainTask <- mlr::makeClassifTask(data = train, target = "has_td")
testTask <- mlr::makeClassifTask(data = test, target = "has_td")
mlr::configureMlr() # this line is needed when using mlr without loading it (mlr::)
task = mlr::makeClassifTask(data = train, target = "has_td")
lrn = mlr::makeLearner("classif.randomForest", predict.type = "prob")
rf = mlr::train(lrn, task)

#... or train models using caret...
# setting caret cross validation, here tuned for speed (not accuracy!)
fitControl <- caret::trainControl(method = "cv",number = 2,classProbs=TRUE)
# mnl model using glmnet package
mnl = caret::train(has_td ~.,data = train, method = "glmnet",trControl = fitControl)

## ----h2o_keras, echo=TRUE, eval=FALSE-----------------------------------------
#  #.. or train models using h2o... [NOT RUN]
#  h2o::h2o.init()
#  h2o::h2o.no_progress()
#  h2o_train = h2o::as.h2o(train)
#  h2o_test = h2o::as.h2o(test)
#  gbm <- h2o::h2o.gbm(y = "has_td",
#                            x = setdiff(colnames(train), "has_td"),
#                            training_frame = h2o_train,
#                            nfolds = 5)
#  
#  #.. or train models using keras...  [NOT RUN]
#  x_train <- as.matrix(train[,-1]); y=train[,1]; y_train <- keras::to_categorical(as.numeric(y)-1); `%>%` <- magrittr::`%>%`
#  nn <- keras::keras_model_sequential() %>%
#  keras::layer_dense(units = 16,kernel_initializer = "uniform", activation='relu',input_shape = NCOL(x_train)) %>%
#    keras::layer_dense(units = 16,kernel_initializer = "uniform", activation='relu') %>%
#    keras::layer_dense(units = length(levels(train[,1])),activation='softmax')
#  nn %>% keras::compile(optimizer = 'rmsprop',loss = 'categorical_crossentropy',metrics = c('accuracy'))
#  nn %>% keras::fit(x_train,y_train,epochs = 20,batch_size = 1028,verbose=0)
#  

## ----psn_params, echo=FALSE---------------------------------------------------
# prepare data
text_tbl <- data.frame(
  Parameter = c('datasets *','dataset_labels' , 'models *','model_labels','target_column *','ntiles'),
  `Type and Description` = c(
'List of Strings. A list of the names of the dataframe objects to include in model evaluation. All dataframes need to contain target variable and feature variables.',
'List of Strings. A list of labels for the datasets, user. When dataset_labels is not specified, the names from datasets are used.',
'List of Strings. Names of the model objects containing parameters to apply models to data. To use this function, model objects need to be generated by the mlr package or by the caret package or by the h20 package. Modelplotr automatically detects whether the model is built using mlr or caret or h2o.',
'List of Strings. Labels for the models to use in plots. When model_labels is not specified, the names from moddels are used.',
'String. Name of the target variable in datasets. Target can be either binary or multinomial. Continuous targets are not supported.',
'Integer. Number of ntiles. The ntile parameter represents the specified number of equally sized buckets the observations in each dataset are grouped into. By default, observations are grouped in 10 equally sized buckets, often referred to as deciles.'
  )
)

kable(text_tbl) %>%
  kableExtra::kable_styling(full_width = T,font_size = 10) %>%
  kableExtra::row_spec(c(1,3,5),italic = T)
  


## ----prepdata, echo=TRUE------------------------------------------------------

# prepare data (for h2o/keras: add "gbm" and "nn" to models and nice labels to model_labels params)

scores_and_ntiles <- prepare_scores_and_ntiles(datasets=list("train","test"),
                      dataset_labels = list("train data","test data"),
                      models = list("rf","mnl"),  
                      model_labels = list("random forest","multinomial logit"), 
                      target_column="has_td",
                      ntiles = 100)


## ----df_sd,echo=FALSE---------------------------------------------------------
scores_and_ntiles %>%
  head(5)%>%
  kable(row.names = FALSE) %>%
  kableExtra::kable_styling(font_size = 10,full_width = FALSE,position="left")

## ----scopes, echo=FALSE-------------------------------------------------------
# prepare data
text_tbl <- data.frame(
  Scope = c('"no_comparison" (default)','"compare_models"' , '"compare_datasets"','"compare_targetclasses"'),
  Description = c(
    "In this perspective, you're interested in the performance of one model on one dataset for one target class. Therefore, only one line is plotted in the plots. The parameters select_model_label, select_dataset_label and select_targetclass determine which group is plotted. When not specified, the first alphabetic model, the first alphabetic dataset and the smallest (when select_smallest_targetclass=TRUE) or first alphabetic target value are selected",
"In this perspective, you're interested in how well different models perform in comparison to each other on the same dataset and for the same target value. This results in a comparison between models available in ntiles_aggregate\\$model_label for a selected dataset (default: first alphabetic dataset) and for a selected target value (default: smallest (when select_smallest_targetclass=TRUE) or first alphabetic target value).",
"In this perspective, you're interested in how well a model performs in different datasets for a specific model on the same target value. This results in a comparison between datasets available in ntiles_aggregate\\$dataset_label for a selected model (default: first alphabetic model) and for a selected target value (default: smallest (when select_smallest_targetclass=TRUE) or first alphabetic target value).",
"In this perspective, you're interested in how well a model performs for different target values on a specific dataset.This resuls in a comparison between target classes available in ntiles_aggregate\\$target_class for a selected model (default: first alphabetic model) and for a selected dataset (default: first alphabetic dataset)."
  )
)

kable(text_tbl) %>%
  kableExtra::kable_styling(full_width = T,font_size = 10) %>%
  kableExtra::row_spec(1,italic = T)
  


## ----ps_params, echo=FALSE----------------------------------------------------
# prepare data
text_tbl <- data.frame(
  Parameter = c('prepared_input *','scope' , 'select_model_label','select_dataset_label','select_targetclass','select_smallest_targetclass'),
  `Type and Description` = c(
'Dataframe. Dataframe created with prepare_scores_and_ntiles or dataframe created with aggregate_over_ntiles or a dataframe that is created otherwise with similar layout as the output of these functions (see ?prepare_scores_and_ntiles and ?aggregate_over_ntiles for layout details)',    
'String. Evaluation type of interest. Possible values: "compare_models","compare_datasets", "compare_targetclasses","no_comparison". Default is NA, equivalent to "no_comparison".',
'String. Selected model when scope is "compare_datasets" or "compare_targetclasses" or "no_comparison". Needs to be identical to model descriptions as specified in model_labels (or models when model_labels is not specified). When scope is "compare_models", select_model_label can be used to take a subset of available models.',
'String. Selected dataset when scope is compare_models or compare_targetclasses or no_comparison. Needs to be identical to dataset descriptions as specified in dataset_labels (or datasets when dataset_labels is not specified). When scope is "compare_datasets", select_dataset_label can be used to take a subset of available datasets.',
'String. Selected target value when scope is compare_models or compare_datasets or no_comparison. Default is smallest value when select_smallest_targetclass=TRUE, otherwise first alphabetical value. When scope is "compare_targetclasses", select_targetclass can be used to take a subset of available target classes.',
'Boolean. Select the target value with the smallest number of cases in dataset as group of interest. Default is True, hence the target value with the least observations is selected'
  )
)

kable(text_tbl) %>%
  kableExtra::kable_styling(full_width = T,font_size = 10) %>%
  kableExtra::row_spec(c(1),italic = T)
  


## ----pi,echo=TRUE-------------------------------------------------------------
#generate input data frame for all plots in modelplotr
plot_input <- plotting_scope(prepared_input = scores_and_ntiles)

## ----modelplotr_process, echo=FALSE, out.width = '100%'-----------------------
knitr::include_graphics("https://modelplot.github.io/img/modelplotr_process.png")

## ----custinput_2, echo=FALSE, out.width = '100%'------------------------------
# prepare data
text_tbl <- data.frame(
  column = c('model_label','dataset_label','y_true','prob_[tv1]','prob_[tv2]','...',
             'prob_[tvn]','ntl_[tv1]','ntl_[tv2]','...','ntl_[tvn]'),
  type = c('Factor','Factor','Factor','Decimal','Decimal','...','Decimal','Integer','Integerl','...','Integer'),
definition = c('Name of the model object','Datasets to include in the plot as factor levels','Target with actual values',
'Probability according to model for target value 1','Probability according to model for target value 2','...',
'Probability according to model for target value n','Ntile based on probability according to model for target value 1',
'Ntile based on probability according to model for target value 2','...','Ntile based on probability according to model for target value n')
)

kable(text_tbl) %>%
  kableExtra::row_spec(c(1),italic = T) %>%
  kableExtra::kable_styling(font_size = 10,bootstrap_options = "basic",
  latex_options = "basic", full_width = TRUE, position = "center") 
  


## ----custinput_3, echo=FALSE, out.width = '100%'------------------------------
# prepare data
text_tbl <- data.frame(
  column = c('model_label','dataset_label','target_class','ntile','neg','pos','tot','pct','negtot','postot','tottot','pcttot',
'cumneg','cumpos','cumtot','cumpct','gain','cumgain','gain_ref','gain_opt','lift','cumlift','cumlift_ref'),
  type = c('String','Factor','String or Integer','Integer','Integer','Integer','Integer','Decimal','Integer','Integer','Integer',
'Decimal','Integer','Integer','Integer','Integer','Decimal','Decimal','Decimal','Decimal','Decimal','Decimal','Decimal'),
definition = c('Name of the model object','Datasets to include in the plot as factor levels','Target classes to include in the plot','Ntile groups based on model probability for target class','Number of cases not belonging to target class in dataset in ntile','Number of cases belonging to target class in dataset in ntile','Total number of cases in dataset in ntile',
'Percentage of cases in dataset in ntile that belongs to target class (pos/tot)','Total number of cases not belonging to target class in dataset','Total number of cases belonging to target class in dataset','Total number of cases in dataset',
'Percentage of cases in dataset that belongs to target class (postot / tottot)','Cumulative number of cases not belonging to target class in dataset from ntile 1 up until ntile','Cumulative number of cases belonging to target class in dataset from ntile 1 up until ntile','Cumulative number of cases in dataset from ntile 1 up until ntile','Cumulative percentage of cases belonging to target class in dataset from ntile 1 up until ntile (cumpos/cumtot)','Gains value for dataset for ntile (pos/postot)',
'Cumulative gains value for dataset for ntile (cumpos/postot)','Lower reference for gains value for dataset for ntile (ntile/#ntiles)','Upper reference for gains value for dataset for ntile','Lift value for dataset for ntile (pct/pcttot)',
'Cumulative lift value for dataset for ntile ((cumpos/cumtot)/pcttot)','Reference value for Cumulative lift value (constant: 1)')
)

kable(text_tbl) %>%
  #kable_styling(full_width = T,font_size = 10) %>%
  kableExtra::kable_styling(font_size = 10,full_width = FALSE,position="left") %>%
  kableExtra::row_spec(c(1),italic = T)
  


## ----plot_cg,echo=TRUE, fig.width=7.2,fig.height=5----------------------------
plot_cumgains(data = plot_input)

## ----plot_cl_r_cr,echo=TRUE, fig.width=7.2,fig.height=5,eval=FALSE------------
#  #Cumulative lift
#  plot_cumlift(data = plot_input)
#  
#  #Response plot
#  plot_response(data = plot_input)
#  
#  #Cumulative response plot
#  plot_cumresponse(data = plot_input)

## ----decrease_ntile, echo=FALSE-----------------------------------------------
# prepare data
scores_and_ntiles2 <- prepare_scores_and_ntiles(datasets=list("train","test"),
                      dataset_labels = list("train data","test data"),
                      models = list("rf","mnl"),
                      model_labels = list("random forest","multinomial logit"),
                      target_column="has_td",
                      ntiles = 10)
plot_input <- plotting_scope(prepared_input = scores_and_ntiles2)

## ----plot_multi,echo=TRUE, fig.width=7.2,fig.height=5-------------------------
plot_multiplot(data = plot_input)

## ----increase_ntile, echo=FALSE-----------------------------------------------
plot_input <- plotting_scope(prepared_input = scores_and_ntiles)

## ----fin_params, echo=FALSE---------------------------------------------------
# prepare data
text_tbl <- data.frame(
  Parameter = c('fixed_costs','variable_costs_per_unit','profit_per_unit'),
  `Type and Description` = c(
'Numeric. Specifying the fixed costs related to a selection based on the model. These costs are constant and do not vary with selection size (ntiles).',    
'Numeric. Specifying the variable costs per selected unit for a selection based on the model. These costs vary with selection size (ntiles).',
'Numeric. Specifying the profit per unit in case the selected unit converts / responds positively.'
  )
)

kable(text_tbl) %>%
  kableExtra::kable_styling(full_width = T,font_size = 10) %>%
  kableExtra::row_spec(c(1),italic = T)
  


## ----plot_roi,echo=TRUE, fig.width=7.2,fig.height=5---------------------------
plot_roi(data = plot_input,fixed_costs = 1000,variable_costs_per_unit = 10,profit_per_unit = 50)

## ----plot_costrev_profit,echo=TRUE, fig.width=7.2,fig.height=5----------------

#Costs & Revenues plot, highlighted at max roi instead of max profit
plot_costsrevs(data = plot_input,fixed_costs = 1000,variable_costs_per_unit = 10,profit_per_unit = 50,highlight_ntile = "max_roi")

#Profit plot , highlighted at custom ntile instead of at max profit
plot_profit(data = plot_input,fixed_costs = 1000,variable_costs_per_unit = 10,profit_per_unit = 50,highlight_ntile = 5)


## ----plot_cgh,echo=TRUE, fig.width=7.2,fig.height=5---------------------------
plot_cumgains(data = plot_input,highlight_ntile = 20)

## ----plot_crhh,echo=TRUE, fig.width=7.2,fig.height=5--------------------------
plot_cumresponse(data = plot_input,highlight_ntile = 20,highlight_how = 'plot')

## ----customtext,echo=TRUE-----------------------------------------------------
my_text <- customize_plot_text(plot_input=plot_input)

#explore default values for the cumulative response plot:
my_text$cumresponse

#translate to Dutch
my_text$cumresponse$plottitle <- 'Cumulatieve Respons grafiek'
my_text$cumresponse$x_axis_label <- 'percentiel'
my_text$cumresponse$y_axis_label <- '% respons (cumulatief)'
my_text$cumresponse$response_refline_label <- 'respons in totale dataset'
my_text$cumresponse$annotationtext <- "Selecteren we percentiel 1 t/m &NTL volgens model &MDL in dataset &DS dan is het %% &YVAL gevallen in de selectie &VALUE."



## ----plotcustomtext,echo=TRUE, fig.width=7.2,fig.height=5---------------------
plot_cumresponse(data = plot_input,highlight_ntile = 20,custom_plot_text = my_text)

## ----plotcustomcolor,echo=TRUE, fig.width=7.2,fig.height=5--------------------
# set scope to compare models, to have several lines in the plots
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope = 'compare_models')

#customize plot line colors with RColorbrewer 
plot_cumgains(data = plot_input,custom_line_colors = RColorBrewer::brewer.pal(2,'Accent'))

#customize plot line colors with color names / hexadecimal codes 
plot_cumlift(data = plot_input,custom_line_colors = c('deepskyblue2','#FF0000'))


## ----saveplot,echo=TRUE,eval=FALSE, fig.width=7.2,fig.height=5----------------
#  
#  # save plot with defaults
#  plot_cumgains(data = plot_input,save_fig = TRUE)
#  
#  # save plot with custom filename
#  plot_cumlift(data = plot_input,save_fig_filename = 'plot123.png')
#  
#  # save plot with custom location
#  plot_cumresponse(data = plot_input,save_fig_filename = 'D:\\')
#  
#  # save plot with custom location and filename
#  plot_cumresponse(data = plot_input,save_fig_filename = 'D:\\plot123.png')
#  

## ----multinom, echo=TRUE, fig.width=7.2,fig.height=5--------------------------

# prepare data for training model for multinomial target td_type and train models
train_index =  base::sample(seq(1, nrow(bank_td)),size = 0.5*nrow(bank_td) ,replace = FALSE)
train = bank_td[train_index,c('td_type','duration','campaign','pdays','previous','euribor3m')]
test = bank_td[-train_index,c('td_type','duration','campaign','pdays','previous','euribor3m')]

# train a model
# setting caret cross validation, here tuned for speed (not accuracy!)
fitControl <- caret::trainControl(method = "cv",number = 2,classProbs=TRUE)
# mnl model using glmnet package
mnl = caret::train(td_type ~.,data = train, method = "glmnet",trControl = fitControl)

# prepare data
scores_and_ntiles <- prepare_scores_and_ntiles(datasets=list("train","test"),
                      dataset_labels = list("train data","test data"),
                      models = list("mnl"),
                      model_labels = list("multinomial logit"),
                      target_column="td_type",
                      ntiles = 100)

#generate input data frame for all plots, set scope at comparing target classes, leave out the 'no.td' class
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope = 'compare_targetclasses',
                             select_targetclass = c('td.type.A','td.type.B','td.type.C' ))

#plot 
plot_cumresponse(data = plot_input)


