final_project_feature_selection.Rmd.

1 Feature Selection Introduction

I used the Boruta algorithm for its feature selection. The problem is that the features aren’t ordered by any statistical significance. For this reason, I decided to use the ordinary least squares model to help both confirm the features selected by the Boruta model, but also to determine the more significant features affecting PositiveChange. I’m hoping to select the common features between the two models, and use the ordering information from the OLS model.

I’ve decided against using the RFE model because Boruta is based on the RFE model.

library(Boruta)
library(mlbench)

# return the OLS stepwise model given the dataframe
run_stepwise <- function(df_run) {
  set.seed(1234)
  base.mod <- lm(PositiveChange ~ 1, data = df_run)
  # Define the full model - including all predictors
  all.mod <- lm(PositiveChange ~ ., data = df_run)
  ols_step <- lm(PositiveChange ~ ., data = df_run)
  ols_step <- step(base.mod, scope = list(lower = base.mod, upper = all.mod), direction = 'both', k = 2, trace = F)
}

# translate the OLS model formula from obscure fieldnames into their more descriptive descriptions
formula_to_vector_descriptions <- function(formula) {
  vector_untranslated <- trimws(strsplit(as.character(formula[3]), "\\+")[[1]])
  vector_translation <- lapply(vector_untranslated, fieldname_to_description)
  return(vector_translation)
}

# translate the OLS model summary using the descriptive descriptions instead of their obscure fieldnames
# sort by p-values descending and add stars based upon statistical significance
# p-value < 0.001   -> ***
# p-value < 0.01    -> **
# p-value < 0.05    -> *
# p-value < 0.1     -> .
# else              -> -
lm_to_summary_model <- function(lmodel) {
  sum_ols <- summary(lmodel)
  df_ols_model <- as.data.frame(as.matrix(lapply(names(lmodel$coefficients), fieldname_to_description)))
  vect_pvals <- unname(sum_ols$coefficients[, 4])
  vect_stars <- lapply(vect_pvals, function(x) {
      if(x > 0 & x < 0.001)
        return("***")
      else if(x >= 0.001 & x < 0.01)
        return("**")
      else if(x >= 0.01 & x < 0.05)
        return("*")
      else if(x >= 0.05 & x < 0.1)
        return(".")
      else if(x >= 0.1 & x < 1)
        return("_")
    })
  df_ols_model <- cbind(Variables = df_ols_model$V1, pvalues = vect_pvals, Stars = vect_stars)
  as.data.frame(df_ols_model[order(vect_pvals), ])
}

# crossvalidate the OLS model
crossvalidate_ols <- function(df_train, df_test) {
  yTest <- df_test$PositiveChange
  XTest <- df_test  [ , !names(df_test) %in% "PositiveChange"]
  ols_validate <- run_stepwise(df_train)
  betaHatOLS_step <- ols_validate$coefficients
  var_step <- colnames(ols_validate$model)[-1]
  XTestOLS_step <- cbind(rep(1, nrow(XTest)), XTest[,var_step])
  predOLS_step <- as.matrix(XTestOLS_step)%*%as.matrix(betaHatOLS_step)
  testMSEOLS_step <- mean((predOLS_step - yTest)^2)
  # Report MSE OLS Stepwise feature selection
  testMSEOLS_step
  pred2 <- predict(ols_validate, as.data.frame(XTest))
  any(pred2 == predOLS_step)
}

# TODO: crossvalidate the Boruta model
crossvalidate_boruta <- function(df_train, df_test) {
  yTest <- df_test$PositiveChange
  XTest <- df_test  [ , !names(df_test) %in% "PositiveChange"]
  boruta_validate <- Boruta(PositiveChange ~ ., df_train)
}

1.1 Boruta - Full Dataset

set.seed(1234)
boruta_full <- Boruta(PositiveChange ~ ., data = df_full_scaled)
print(boruta_full)
## Boruta performed 99 iterations in 4.289078 mins.
##  76 attributes confirmed important: anhed2wks_2011, black2011,
## bmi_2011, closef_2011, closem_2011 and 71 more;
##  155 attributes confirmed unimportant: afraid_2011, alc45drinks_11,
## alcever_11, amphetev, artact_11 and 150 more;
##  35 tentative attributes left: alcfreq_11, cigct11, earnlvg_2011,
## email_2011, finanrespscale and 30 more;
#boruta_full <- TentativeRoughFix(boruta_full)
descriptions_full <- formula_to_vector_descriptions(getConfirmedFormula(boruta_full))
paste(descriptions_full, collapse = " + ")
## [1] "Current grade 02 + Hours sleep per night 02 + Stay with task 02 + Solve difficult tasks 02 + Orderly 02 + Do best 02 + Hurt someone 02 + Ever tried chew tobacco 02 + Ever drank? 02 + Ever tried marijuana 02 + Ever tried inhalants 02 + Partner 02 + Converse with adults 02 + Talk to teachers 02 + Ask questions 02 + Class discussions 02 + Joke with teachers 02 + Sexual intercourse 02 + Happy 02 + Interest in life 02 + Satisfied 02 + Contribution 02 + Community 02 + People are good 02 + Society works 02 + Responsibilities 02 + Warm relationships 02 + Challenged you 02 + Expressing ideas 02 + Global self concept 02 + Social initiative 02 + Grade in sch + Child age at assessment - months 02 + A1_1 how satisfied w/ life as a whole + B6a how good at responsibility + B6b how good at problem solving + B6c how good at money management + C1a how good at supervising comp + C1b how good at leading comp w/otrs + C1c how good at logic comp w/otrs + C1d how good at helping comp w/otrs + C1e how intelligent compared w/otrs + C1f how independent compared w/otrs + C1g how confident compared w/others + C1h how decisive compared w/others + C1j how well listen compared w/others + C1k how good at teaching compared w/otrs + D41  how close to father + D46  how close to mother + H15 wtr>2 wks depressed in past 12 mos + H16 wtr>2 wks no interest in life + H27 # of hours of sleep in 24-hr period + K1c how often others treat as stupid + K4 wtr ever sexually assaulted/raped + L1a religious preference + M1 frequency of happiness in last month + M2 freq of interest in life in last mo + M3 freq of feeling satisfied in last mo + M4 freq of feeling contrib to society + M5 freq of feeling belonging to communty + M6 freq of feeling society gettng better + M7 freq of feeling people basically good + M8 freq feelng way soc works makes sense + M9 freq feel managng daily responsibilty + M10 freq feeling trusting rels w/others + M11 freq of feeling challenged to grow + M12 freq feeling confident of own ideas + M13 freq of feeling liked personality + M14 freq of feeling life had direction + Mental health:  non-spec psych distress + Body weight percentile status + Enrollment status + Completed education of mother + Year individual born                  01 + black2011 + dob"
#print(boruta_full$finalDecision[boruta_full$finalDecision %in% c("Confirmed", "Tentative")])

df_full_long <- gather(as.data.frame(boruta_full$ImpHistory), feature, measurement)
feature_full <- df_full_long$feature
feature_label_full <- unlist(lapply(feature_full, fieldname_to_description))
measurement_full <- unlist(lapply(df_full_long$measurement, function(x) { if(x == "-Inf") x = 0; return(x)}))
df_full_plotly <- data.frame(Feature = feature_full, FeatureLabel = feature_label_full, Measurement = measurement_full)
plot_ly(df_full_plotly, x = ~FeatureLabel, y = ~Measurement, color = ~Feature, marker = list(size = 2), type = "box") %>%
  layout(title="Box-and-whisker Plots Across All Features",
           xaxis = list(title="Features"),
           yaxis = list(title="Importance"),
           showlegend=F)

1.2 Boruta - Objective Dataset

set.seed(1234)
boruta_objective <- Boruta(PositiveChange ~ ., data = df_objective_scaled)
print(boruta_objective)
## Boruta performed 99 iterations in 4.879038 mins.
##  55 attributes confirmed important: alcever_11, anhed2wks_2011,
## black2011, bmi_2011, closef_2011 and 50 more;
##  95 attributes confirmed unimportant: amphetev, barbev, bingeeat11,
## BULLY02, danger_2011 and 90 more;
##  56 tentative attributes left: afraid_2011, alc45drinks_11, alcfreq_11,
## artact_11, asthma and 51 more;
#boruta_objective <- TentativeRoughFix(boruta_objective)
descriptions_objective <- formula_to_vector_descriptions(getConfirmedFormula(boruta_objective))
paste(descriptions_objective, collapse = " + ")
## [1] "Whether sample or nonsample + Race/ethnicity 02 + Religious comfort 02 + Spiritual? 02 + Current grade 02 + # Days ate: dairy 02 + Hours sleep per night 02 + Hurt someone 02 + Ever tried chew tobacco 02 + Ever drank? 02 + Ever tried marijuana 02 + Ever tried inhalants 02 + Partner 02 + Converse with adults 02 + Talk to teachers 02 + Ask questions 02 + Class discussions 02 + Joke with teachers 02 + Sexual intercourse 02 + Global self concept 02 + Social initiative 02 + Grade in sch + Child age at assessment - months 02 + A6 how often watched or read news + A9 wtr ever used internet + A10a wtr used internet for email + A10b wtr used internet for school + A10e wtr used internet for soc netwrking + B5a how much resonsiblty earng own livng + B5d how much responsiblty managing money + D11 how satisfied w/relationship + D41  how close to father + D46  how close to mother + H15 wtr>2 wks depressed in past 12 mos + H16 wtr>2 wks no interest in life + H27 # of hours of sleep in 24-hr period + H36 wtr drink alcohol-head + K1c how often others treat as stupid + K1e how often others treat as dishonest + K1k how often treated with less respect + K4 wtr ever sexually assaulted/raped + L1a religious preference + L3 freq of attend relig svcs last 12 mos + L4 wtr spiritual person + Responsibilities:  financial + Mental health:  worry + Mental health:  social anxiety + Mental health:  non-spec psych distress + Body weight percentile status + Enrollment status + Completed education of mother + Year individual born                  01 + L8 importance of ethnic group identity + black2011 + dob"
#print(boruta_objective$finalDecision[boruta_objective$finalDecision %in% c("Confirmed", "Tentative")])

df_objective_long <- gather(as.data.frame(boruta_objective$ImpHistory), feature, measurement)
feature_objective <- df_objective_long$feature
feature_label_objective <- unlist(lapply(feature_objective, fieldname_to_description))
measurement_objective <- unlist(lapply(df_objective_long$measurement, function(x) { if(x == "-Inf") x = 0; return(x)}))
df_objective_plotly <- data.frame(Feature = feature_objective, FeatureLabel = feature_label_objective, Measurement = measurement_objective)
plot_ly(df_objective_plotly, x = ~FeatureLabel, y = ~Measurement, color = ~Feature, marker = list(size = 2), type = "box") %>%
  layout(title="Box-and-whisker Plots Across Objective Features",
           xaxis = list(title="Features"),
           yaxis = list(title="Importance"),
           showlegend=F)

1.3 Stepwise Feature Selection - Full Dataset

ols_full <- run_stepwise(df_full_scaled)
df_ols_model_full_trans <- lm_to_summary_model(ols_full)
# get the shortlisted variable
# ols_confirmed_vars_full <- names(unlist(ols_full[[1]]))
# # remove the intercept
# ols_confirmed_vars_full <- ols_confirmed_vars_full[!ols_confirmed_vars_full %in% "(Intercept)"]
# ols_confirmed_vars_full <- unlist(lapply(ols_confirmed_vars_full, fieldname_to_description))
# print(ols_confirmed_vars_full)
# estimate variable importance
varImp_ols_full <- varImp(ols_full, scale=FALSE)
rownames(varImp_ols_full) <- unlist(lapply(rownames(varImp_ols_full), fieldname_to_description))
# summarize importance
print(varImp_ols_full)
##                                           Overall
## M4 freq of feeling contrib to society    8.415381
## H14b how often felt hopeless in past mo  2.412283
## M11 freq of feeling challenged to grow   9.072687
## A9 wtr ever used internet                2.743320
## M12 freq feeling confident of own ideas  5.863598
## M5 freq of feeling belonging to communty 7.864477
## M9 freq feel managng daily responsibilty 5.003928
## M13 freq of feeling liked personality    4.157285
## M7 freq of feeling people basically good 3.299064
## H36 wtr drink alcohol-head               2.573680
## A10d wtr used internet for games         3.041384
## K3c pct of close friends unemp & looking 2.282857
## Volunteer? 02                            1.948493
## H27 # of hours of sleep in 24-hr period  3.093330
## M14 freq of feeling life had direction   4.367528
## A11b wtr in social action groups         2.537139
## M10 freq feeling trusting rels w/others  4.252902
## Orderly 02                               3.479624
## Ever tried chew tobacco 02               4.132048
## Satisfied 02                             3.782453
## Ask questions 02                         1.955088
## L6 hispanicity                           2.527322
## Mental health:  social anxiety           1.543961
## E71a ever in armed forces                2.422663
## Earnings from work last year             3.104391
## # Friends who drink 02                   2.373593
## H40f wtr ever taken tranquilizers        2.364790
## L8 importance of ethnic group identity   2.521809
## M8 freq feelng way soc works makes sense 2.266375
## Contribution 02                          2.714439
## Finish things 02                         2.139137
## H40b wtr ever taken amphetamines         2.493403
## H53f diagnosed with std or hiv           2.061405
## A6 how often watched or read news        1.789889
## Oth relative 02                          2.726997
## C1g how confident compared w/others      2.986164
## D28a number of children                  2.307582
## Highest education level                  2.308413
## # Days pe class at sch 02                1.910717
## H28b frequency of binge eating           2.111068
## Grandparent 02                           1.693460
## Interest in life 02                      2.254645
## K3b2 pct of close friends in coll/grad   1.497890
## B5b how much responsiblty payng own rent 2.392240
## # Days ate: sweets 02                    1.797915
## Self-rated health 02                     1.606074
## Class discussions 02                     1.988545
## D46  how close to mother                 1.695361
## D41  how close to father                 1.726173
## B5d how much responsiblty managing money 1.508523
## K3e pct of close friends married, etc.   2.232634
## K3a pct of close friends w/job not coll  2.163490
## H23a time unit for hvy phys activity--hw 1.820720
## H23a freq of hvy physical activity--hw   2.369867
## K5b age when (first) assaulted           4.925401
## K4 wtr ever sexually assaulted/raped     4.787028
## M3 freq of feeling satisfied in last mo  1.624327
## Child age at assessment - months 02      3.436641
## dob                                      3.888772
## Current grade 02                         2.839005
## Marital/cohabitation status              1.936559
## Month individual born                 01 1.755591
## H12 wtr ever had emotional/psych probs   1.829863
## A10c wtr used internet to shop           1.847349
## C1k how good at teaching compared w/otrs 1.825239
## Community groups  02                     1.730840
## G20 wtr currently in vo/tech training    1.643408
## H23a how often do vigorous activities    1.611952
# crossvalidate between the training and testing data
crossvalidate_ols(df_full_train_scaled, df_full_test_scaled)
## [1] TRUE

1.4 Stepwise Feature Selection - Objective Dataset

ols_objective <- run_stepwise(df_objective_scaled)
df_ols_model_objective_trans <- lm_to_summary_model(ols_objective)
# get the shortlisted variable
# ols_confirmed_vars_objective <- names(unlist(ols_objective[[1]]))
# # remove the intercept
# ols_confirmed_vars_objective <- ols_confirmed_vars_objective[!ols_confirmed_vars_objective %in% "(Intercept)"]
# ols_confirmed_vars_objective <- unlist(lapply(ols_confirmed_vars_objective, fieldname_to_description))
# print(ols_confirmed_vars_objective)
# estimate variable importance
varImp_ols_objective <- varImp(ols_objective, scale=FALSE)
rownames(varImp_ols_objective) <- unlist(lapply(rownames(varImp_ols_objective), fieldname_to_description))
# summarize importance
print(varImp_ols_objective)
##                                           Overall
## K4 wtr ever sexually assaulted/raped     3.923239
## Mental health:  non-spec psych distress  5.894814
## L4 wtr spiritual person                  2.977607
## A10b wtr used internet for school        2.987919
## Mental health:  worry                    4.340180
## D11 how satisfied w/relationship         3.573606
## A7 how often read for pleasure           2.957345
## D41  how close to father                 2.303565
## Volunteer? 02                            2.168514
## A10e wtr used internet for soc netwrking 2.377067
## K1e how often others treat as dishonest  2.366514
## Mental health:  social anxiety           2.785413
## Earnings from work last year             3.319378
## B5b how much responsiblty payng own rent 2.240868
## A10d wtr used internet for games         1.977201
## K3c pct of close friends unemp & looking 3.287942
## K15 how often drove when drunk or high   2.051966
## H40g wtr ever taken steroids             1.417393
## K7b age when (first) attacked            2.440831
## L7 race mention #1                       3.108986
## # Days pe class at sch 02                1.865996
## G19 wtr non-academic training            1.686459
## H15 wtr>2 wks depressed in past 12 mos   2.035415
## H17 wtr had annual dr checkup last year  2.139712
## Coworker 02                              1.968683
## H37 how often have drinks-hd             2.319189
## H50 wtr ever had sexual intercourse      2.710452
## Class discussions 02                     3.268817
## # Days ate: meat 02                      2.934206
## # Days ate: dairy 02                     1.534244
## Grandparent 02                           1.580340
## K7a # times physically attacked          2.258085
## Picked on you 02                         2.170417
## Community groups  02                     1.973635
## L8 importance of ethnic group identity   1.807781
## Highest education level                  2.958225
## H23a how often do vigorous activities    2.419305
## H27 # of hours of sleep in 24-hr period  2.588787
## L3 freq of attend relig svcs last 12 mos 2.101606
## Damaged school 02                        2.663003
## Stopped by police 02                     2.012515
## A10c wtr used internet to shop           1.625131
## D8 wtr romantic relationship now         3.241477
## Marital/cohabitation status              2.839317
## K1k how often treated with less respect  1.791721
## H40a wtr ever taken diet pills           1.561723
## A6 how often watched or read news        1.542058
## L6 hispanicity                           1.600330
## A9 wtr ever used internet                1.850545
## K3n pct of close friends who use drugs   1.431976
# crossvalidate between the training and testing data
crossvalidate_ols(df_objective_train_scaled, df_objective_test_scaled)
## [1] TRUE

2 Summary

As you will see below, the full featured dataset contains more data than the “objective” counterparts. Each dataset provides a different view into the question at hand.

# combine the two common features for full dataset
df_full_feauture_selection_combined <- as.data.frame(df_ols_model_full_trans[is.element(set = df_ols_model_full_trans$Variables, el = as.matrix(descriptions_full)), ])

# combine the two common features for objective dataset
df_objective_feature_selection_combined <- as.matrix(df_ols_model_objective_trans[is.element(set = df_ols_model_objective_trans$Variables, el = as.matrix(descriptions_objective)), ])

print("Combination of Boruta and OLS Full Datasets ")
## [1] "Combination of Boruta and OLS Full Datasets "
df_full_feauture_selection_combined
##                                   Variables      pvalues Stars
## 1    M11 freq of feeling challenged to grow 2.824861e-19   ***
## 5  M9 freq feel managng daily responsibilty 6.136332e-07   ***
## 8    M14 freq of feeling life had direction 1.324403e-05   ***
## 15      Child age at assessment - months 02 0.0006016748   ***
## 16 M7 freq of feeling people basically good 0.0009880329   ***
## 20      C1g how confident compared w/others  0.002861189    **
## 21                         Current grade 02  0.004573647    **
## 22                A9 wtr ever used internet  0.006139365    **
## 33                   # Friends who drink 02   0.01771506     *
## 44           H28b frequency of binge eating   0.03489698     *
## 47                         Ask questions 02   0.05071937     .
## 48                            Volunteer? 02   0.05150324     .
## 49              Marital/cohabitation status   0.05294758     .
## 52   H12 wtr ever had emotional/psych probs   0.06742727     .
## 54 H23a time unit for hvy phys activity--hw   0.06880686     .
## 58                     Community groups  02   0.08364293     .
## 59                 D41  how close to father   0.08447916     .
## 60                 D46  how close to mother   0.09017097     .
## 62    G20 wtr currently in vo/tech training    0.1004642     _
## 63  M3 freq of feeling satisfied in last mo    0.1044722     _
## 64    H23a how often do vigorous activities    0.1071386     _
## 65                     Self-rated health 02    0.1084242     _
## 66           Mental health:  social anxiety    0.1227645     _
## 67 B5d how much responsiblty managing money    0.1315873     _
## 68   K3b2 pct of close friends in coll/grad    0.1343282     _
## 69                              (Intercept)            1     _
## NA                                     NULL         NULL  NULL
print("Combination of Boruta and OLS Objective Datasets ")
## [1] "Combination of Boruta and OLS Objective Datasets "
df_objective_feature_selection_combined
##    Variables                                  pvalues     Stars
## 6  "K3c pct of close friends unemp & looking" 0.001027497 "**" 
## 17 "H50 wtr ever had sexual intercourse"      0.006779113 "**" 
## 24 "H37 how often have drinks-hd"             0.02048952  "*"  
## 25 "D41  how close to father"                 0.02135375  "*"  
## 27 "B5b how much responsiblty payng own rent" 0.02514881  "*"  
## 28 "Picked on you 02"                         0.03009784  "*"  
## 31 "L3 freq of attend relig svcs last 12 mos" 0.0357181   "*"  
## 32 "K15 how often drove when drunk or high"   0.04030866  "*"  
## 34 "Stopped by police 02"                     0.04430541  "*"  
## 36 "Community groups  02"                     0.04856676  "*"  
## 39 "A9 wtr ever used internet"                0.06438898  "."  
## 40 "L8 importance of ethnic group identity"   0.07079747  "."  
## 41 "K1k how often treated with less respect"  0.07333554  "."  
## 43 "A10c wtr used internet to shop"           0.1042992   "_"  
## 44 "L6 hispanicity"                           0.1096903   "_"  
## 46 "H40a wtr ever taken diet pills"           0.1185186   "_"  
## 47 "A6 how often watched or read news"        0.1232248   "_"  
## 48 "# Days ate: dairy 02"                     0.1251347   "_"  
## NA NULL                                       NULL        NULL