Entrenando dos modelos para predecir ingresos

El objetivo de esta práctica es que puedan entrenar dos modelos con caret y compararlos. Para ello, volveremos al problema de regresión original: queremos construir un modelo que permita predecir los ingresos de la ocupación principal (\(p21\)) de los ocupados en la semana de referencia encuestados en el II Trimestre de 2015.

Para ello, deberán entrenar una regresión lineal y un modelo de random forest y comparar sus performances:

Importar las librerías a utilizar

library(caret)
library(tidyverse)
library(rpart)

Importar y setear los datos correctamente


load('../data/EPH_2015_II.RData')

data$pp03i<-factor(data$pp03i, labels=c('1-SI', '2-No', '9-NS'))



data$intensi<-factor(data$intensi, labels=c('1-Sub_dem', '2-SO_no_dem', 
                                            '3-Ocup.pleno', '4-Sobreoc',
                                            '5-No trabajo', '9-NS'))

data$pp07a<-factor(data$pp07a, labels=c('0-NC',
                                        '1-Menos de un mes',
                                        '2-1 a 3 meses',
                                        '3-3 a 6 meses',
                                        '4-6 a 12 meses',
                                        '5-12 a 60 meses',
                                        '6-Más de 60 meses',
                                        '9-NS'))

df_imp <- data %>%
        filter(imp_inglab1==1) %>%
        select(-imp_inglab1)

df_train <- data %>%
        filter(imp_inglab1==0) %>%
        select(-imp_inglab1) %>%
        mutate(p21 = case_when(
                        p21==0 ~ 100,
                        TRUE ~ p21))

Hacer un train-test split

set.seed(6615)
tr_index <- createDataPartition(y=df_train$p21,
                                p=0.8,
                                list=FALSE)

train <- df_train[tr_index,]
test <- df_train[-tr_index,]

Entrenar la regresión lineal

lm_fit <- train(p21~., method='lm',
                data=train,
                trControl=trainControl(method='none'))

Setear la partición para el tuneo del reandom forest


set.seed(7412)
cv_index_final <- createFolds(y = train$p21,
                        k=5,
                        list=FALSE,
                        returnTrain=TRUE)

fitControl <- trainControl(
        index=cv_index_final, 
        method="cv",
        number=5,
        allowParallel=FALSE)

Generar la grilla de hiperparámetros

grid_rf <- expand.grid(mtry=c(21),
                    splitrule='variance',
                    min.node.size=c(1))

Entrenar el modelo

t0<-proc.time()
rf_fit_regresion <- train(p21 ~ ., 
                          data = train, 
                          method = "ranger", 
                          trControl = fitControl,
                          tuneGrid = grid_rf,
                          verbose = FALSE)

Realizar las predicciones en el test-set y comparar

eval_regression <- function(model, test_set, y){
        preds <- predict(model, test_set)
        
        metrics <- postResample(preds, y) 
        return(metrics)
}


models <- list(lm = lm_fit,
               rf = rf_fit_regresion)


model_metrics <- models %>%
        map(eval_regression, test, test$p21)
prediction from a rank-deficient fit may be misleadingForest grown in ranger version <0.11.5, converting ...Error in predict.ranger.forest(forest, data, predict.all, num.trees, type,  : 
  Error: One or more independent variables not found in data.

Comparar en un scatter_plot las predicciones de cada modelo con los valores reales

rr model_preds <- models %>% map(predict, test) %>% as_tibble() %>% mutate(y = test$p21)

prediction from a rank-deficient fit may be misleading

rr ggplot(model_preds) + geom_point(aes(x=lm, y=y, color=‘Modelo Lineal’)) + geom_point(aes(x=rf, y=y, color=‘Random Forest’)) + labs(x = , y = , color=‘Modelos’)

LS0tCnRpdGxlOiAiUHLDoWN0aWNhIGluZGVwZW5kaWVudGUgLSBTT0xVQ0lPTiIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMgRW50cmVuYW5kbyBkb3MgbW9kZWxvcyBwYXJhIHByZWRlY2lyIGluZ3Jlc29zCgpFbCBvYmpldGl2byBkZSBlc3RhIHByw6FjdGljYSBlcyBxdWUgcHVlZGFuIGVudHJlbmFyIGRvcyBtb2RlbG9zIGNvbiBgY2FyZXRgIHkgY29tcGFyYXJsb3MuIFBhcmEgZWxsbywgdm9sdmVyZW1vcyBhbCBwcm9ibGVtYSBkZSByZWdyZXNpw7NuIG9yaWdpbmFsOiBxdWVyZW1vcyBjb25zdHJ1aXIgdW4gbW9kZWxvIHF1ZSBwZXJtaXRhIHByZWRlY2lyIGxvcyBpbmdyZXNvcyBkZSBsYSBvY3VwYWNpw7NuIHByaW5jaXBhbCAoJHAyMSQpIGRlIGxvcyBvY3VwYWRvcyBlbiBsYSBzZW1hbmEgZGUgcmVmZXJlbmNpYSBlbmN1ZXN0YWRvcyBlbiBlbCBJSSBUcmltZXN0cmUgZGUgMjAxNS4KClBhcmEgZWxsbywgZGViZXLDoW4gZW50cmVuYXIgdW5hIHJlZ3Jlc2nDs24gbGluZWFsIHkgdW4gbW9kZWxvIGRlIHJhbmRvbSBmb3Jlc3QgeSBjb21wYXJhciBzdXMgcGVyZm9ybWFuY2VzOgoKCiMjIyBJbXBvcnRhciBsYXMgbGlicmVyw61hcyBhIHV0aWxpemFyCgpgYGB7ciwgbWVzc2FnZT1GQUxTRX0KbGlicmFyeShjYXJldCkKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkocnBhcnQpCmBgYAoKCiMjIyBJbXBvcnRhciB5IHNldGVhciBsb3MgZGF0b3MgY29ycmVjdGFtZW50ZQoKYGBge3J9Cgpsb2FkKCcuLi9kYXRhL0VQSF8yMDE1X0lJLlJEYXRhJykKCmRhdGEkcHAwM2k8LWZhY3RvcihkYXRhJHBwMDNpLCBsYWJlbHM9YygnMS1TSScsICcyLU5vJywgJzktTlMnKSkKCgoKZGF0YSRpbnRlbnNpPC1mYWN0b3IoZGF0YSRpbnRlbnNpLCBsYWJlbHM9YygnMS1TdWJfZGVtJywgJzItU09fbm9fZGVtJywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJzMtT2N1cC5wbGVubycsICc0LVNvYnJlb2MnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICc1LU5vIHRyYWJham8nLCAnOS1OUycpKQoKZGF0YSRwcDA3YTwtZmFjdG9yKGRhdGEkcHAwN2EsIGxhYmVscz1jKCcwLU5DJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICcxLU1lbm9zIGRlIHVuIG1lcycsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnMi0xIGEgMyBtZXNlcycsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnMy0zIGEgNiBtZXNlcycsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnNC02IGEgMTIgbWVzZXMnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJzUtMTIgYSA2MCBtZXNlcycsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnNi1Nw6FzIGRlIDYwIG1lc2VzJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICc5LU5TJykpCgpkZl9pbXAgPC0gZGF0YSAlPiUKICAgICAgICBmaWx0ZXIoaW1wX2luZ2xhYjE9PTEpICU+JQogICAgICAgIHNlbGVjdCgtaW1wX2luZ2xhYjEpCgpkZl90cmFpbiA8LSBkYXRhICU+JQogICAgICAgIGZpbHRlcihpbXBfaW5nbGFiMT09MCkgJT4lCiAgICAgICAgc2VsZWN0KC1pbXBfaW5nbGFiMSkgJT4lCiAgICAgICAgbXV0YXRlKHAyMSA9IGNhc2Vfd2hlbigKICAgICAgICAgICAgICAgICAgICAgICAgcDIxPT0wIH4gMTAwLAogICAgICAgICAgICAgICAgICAgICAgICBUUlVFIH4gcDIxKSkKCgpgYGAKCgoKIyMjIEhhY2VyIHVuIHRyYWluLXRlc3Qgc3BsaXQKCmBgYHtyfQpzZXQuc2VlZCg2NjE1KQp0cl9pbmRleCA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKHk9ZGZfdHJhaW4kcDIxLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHA9MC44LAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxpc3Q9RkFMU0UpCgp0cmFpbiA8LSBkZl90cmFpblt0cl9pbmRleCxdCnRlc3QgPC0gZGZfdHJhaW5bLXRyX2luZGV4LF0KCmBgYAoKCiMjIEVudHJlbmFyIGxhIHJlZ3Jlc2nDs24gbGluZWFsCgpgYGB7cn0KbG1fZml0IDwtIHRyYWluKHAyMX4uLCBtZXRob2Q9J2xtJywKICAgICAgICAgICAgICAgIGRhdGE9dHJhaW4sCiAgICAgICAgICAgICAgICB0ckNvbnRyb2w9dHJhaW5Db250cm9sKG1ldGhvZD0nbm9uZScpKQpgYGAKCgojIyBTZXRlYXIgbGEgcGFydGljacOzbiBwYXJhIGVsIHR1bmVvIGRlbCByZWFuZG9tIGZvcmVzdAoKYGBge3J9CgpzZXQuc2VlZCg3NDEyKQpjdl9pbmRleF9maW5hbCA8LSBjcmVhdGVGb2xkcyh5ID0gdHJhaW4kcDIxLAogICAgICAgICAgICAgICAgICAgICAgICBrPTUsCiAgICAgICAgICAgICAgICAgICAgICAgIGxpc3Q9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgICAgIHJldHVyblRyYWluPVRSVUUpCgpmaXRDb250cm9sIDwtIHRyYWluQ29udHJvbCgKICAgICAgICBpbmRleD1jdl9pbmRleF9maW5hbCwgCiAgICAgICAgbWV0aG9kPSJjdiIsCiAgICAgICAgbnVtYmVyPTUsCiAgICAgICAgYWxsb3dQYXJhbGxlbD1GQUxTRSkKCmBgYAoKCiMjIyBHZW5lcmFyIGxhIGdyaWxsYSBkZSBoaXBlcnBhcsOhbWV0cm9zCgpgYGB7cn0KZ3JpZF9yZiA8LSBleHBhbmQuZ3JpZChtdHJ5PWMoMjEpLAogICAgICAgICAgICAgICAgICAgIHNwbGl0cnVsZT0ndmFyaWFuY2UnLAogICAgICAgICAgICAgICAgICAgIG1pbi5ub2RlLnNpemU9YygxKSkKYGBgCgoKIyMjIEVudHJlbmFyIGVsIG1vZGVsbwpgYGB7cn0KdDA8LXByb2MudGltZSgpCnJmX2ZpdF9yZWdyZXNpb24gPC0gdHJhaW4ocDIxIH4gLiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgZGF0YSA9IHRyYWluLCAKICAgICAgICAgICAgICAgICAgICAgICAgICBtZXRob2QgPSAicmFuZ2VyIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgdHJDb250cm9sID0gZml0Q29udHJvbCwKICAgICAgICAgICAgICAgICAgICAgICAgICB0dW5lR3JpZCA9IGdyaWRfcmYsCiAgICAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZSA9IEZBTFNFKQpwcm9jLnRpbWUoKSAtIHQwCgpzYXZlUkRTKHJmX2ZpdF9yZWdyZXNpb24sICcuLi9tb2RlbHMvcmZfZml0X3JlZ3Jlc2lvbi5SRFMnKQoKcmZfZml0X3JlZ3Jlc2lvbgpgYGAKCmBgYHtyLCBpbmNsdWRlPUZBTFNFfQpyZl9maXRfcmVncmVzaW9uIDwtIHJlYWRSRFMoJy4uLy4uL2NsYXNlXzMvbW9kZWxzL3JmX2ZpdF9yZWdyZXNpb24uUkRTJykKYGBgCgoKIyMjIFJlYWxpemFyIGxhcyBwcmVkaWNjaW9uZXMgZW4gZWwgdGVzdC1zZXQgeSBjb21wYXJhcgoKYGBge3J9CmV2YWxfcmVncmVzc2lvbiA8LSBmdW5jdGlvbihtb2RlbCwgdGVzdF9zZXQsIHkpewogICAgICAgIHByZWRzIDwtIHByZWRpY3QobW9kZWwsIHRlc3Rfc2V0KQogICAgICAgIAogICAgICAgIG1ldHJpY3MgPC0gcG9zdFJlc2FtcGxlKHByZWRzLCB5KSAKICAgICAgICByZXR1cm4obWV0cmljcykKfQoKCm1vZGVscyA8LSBsaXN0KGxtID0gbG1fZml0LAogICAgICAgICAgICAgICByZiA9IHJmX2ZpdF9yZWdyZXNpb24pCgoKbW9kZWxfbWV0cmljcyA8LSBtb2RlbHMgJT4lCiAgICAgICAgbWFwKGV2YWxfcmVncmVzc2lvbiwgdGVzdCwgdGVzdCRwMjEpCgptb2RlbF9tZXRyaWNzCgpgYGAKCgojIyMgQ29tcGFyYXIgZW4gdW4gc2NhdHRlcl9wbG90IGxhcyBwcmVkaWNjaW9uZXMgZGUgY2FkYSBtb2RlbG8gY29uIGxvcyB2YWxvcmVzIHJlYWxlcwoKYGBge3J9Cm1vZGVsX3ByZWRzIDwtIG1vZGVscyAlPiUKICAgICAgICBtYXAocHJlZGljdCwgdGVzdCkgJT4lCiAgICAgICAgYXNfdGliYmxlKCkgJT4lCiAgICAgICAgbXV0YXRlKHkgPSB0ZXN0JHAyMSkKCmdncGxvdChtb2RlbF9wcmVkcykgKyAKICAgICAgICBnZW9tX3BvaW50KGFlcyh4PWxtLCB5PXksIGNvbG9yPSdNb2RlbG8gTGluZWFsJykpICsgCiAgICAgICAgZ2VvbV9wb2ludChhZXMoeD1yZiwgeT15LCBjb2xvcj0nUmFuZG9tIEZvcmVzdCcpKSArCiAgICAgICAgbGFicyh4ID0gIlByZWRpY2Npb25lcyIsCiAgICAgICAgICAgICB5ID0gIk9ic2VydmFkb3MiLAogICAgICAgICAgICAgY29sb3I9J01vZGVsb3MnKQoKYGBgCg==