final_project_feature_selection.Rmd.
I used the Boruta algorithm for its feature selection. The problem is that the features aren’t ordered by any statistical significance. For this reason, I decided to use the ordinary least squares model to help both confirm the features selected by the Boruta model, but also to determine the more significant features affecting PositiveChange. I’m hoping to select the common features between the two models, and use the ordering information from the OLS model.
I’ve decided against using the RFE model because Boruta is based on the RFE model.
library(Boruta)
library(mlbench)
# return the OLS stepwise model given the dataframe
<- function(df_run) {
run_stepwise set.seed(1234)
<- lm(PositiveChange ~ 1, data = df_run)
base.mod # Define the full model - including all predictors
<- lm(PositiveChange ~ ., data = df_run)
all.mod <- lm(PositiveChange ~ ., data = df_run)
ols_step <- step(base.mod, scope = list(lower = base.mod, upper = all.mod), direction = 'both', k = 2, trace = F)
ols_step
}
# translate the OLS model formula from obscure fieldnames into their more descriptive descriptions
<- function(formula) {
formula_to_vector_descriptions <- trimws(strsplit(as.character(formula[3]), "\\+")[[1]])
vector_untranslated <- lapply(vector_untranslated, fieldname_to_description)
vector_translation return(vector_translation)
}
# translate the OLS model summary using the descriptive descriptions instead of their obscure fieldnames
# sort by p-values descending and add stars based upon statistical significance
# p-value < 0.001 -> ***
# p-value < 0.01 -> **
# p-value < 0.05 -> *
# p-value < 0.1 -> .
# else -> -
<- function(lmodel) {
lm_to_summary_model <- summary(lmodel)
sum_ols <- as.data.frame(as.matrix(lapply(names(lmodel$coefficients), fieldname_to_description)))
df_ols_model <- unname(sum_ols$coefficients[, 4])
vect_pvals <- lapply(vect_pvals, function(x) {
vect_stars if(x > 0 & x < 0.001)
return("***")
else if(x >= 0.001 & x < 0.01)
return("**")
else if(x >= 0.01 & x < 0.05)
return("*")
else if(x >= 0.05 & x < 0.1)
return(".")
else if(x >= 0.1 & x < 1)
return("_")
})<- cbind(Variables = df_ols_model$V1, pvalues = vect_pvals, Stars = vect_stars)
df_ols_model as.data.frame(df_ols_model[order(vect_pvals), ])
}
# crossvalidate the OLS model
<- function(df_train, df_test) {
crossvalidate_ols <- df_test$PositiveChange
yTest <- df_test [ , !names(df_test) %in% "PositiveChange"]
XTest <- run_stepwise(df_train)
ols_validate <- ols_validate$coefficients
betaHatOLS_step <- colnames(ols_validate$model)[-1]
var_step <- cbind(rep(1, nrow(XTest)), XTest[,var_step])
XTestOLS_step <- as.matrix(XTestOLS_step)%*%as.matrix(betaHatOLS_step)
predOLS_step <- mean((predOLS_step - yTest)^2)
testMSEOLS_step # Report MSE OLS Stepwise feature selection
testMSEOLS_step<- predict(ols_validate, as.data.frame(XTest))
pred2 any(pred2 == predOLS_step)
}
# TODO: crossvalidate the Boruta model
<- function(df_train, df_test) {
crossvalidate_boruta <- df_test$PositiveChange
yTest <- df_test [ , !names(df_test) %in% "PositiveChange"]
XTest <- Boruta(PositiveChange ~ ., df_train)
boruta_validate }
set.seed(1234)
<- Boruta(PositiveChange ~ ., data = df_full_scaled)
boruta_full print(boruta_full)
## Boruta performed 99 iterations in 4.289078 mins.
## 76 attributes confirmed important: anhed2wks_2011, black2011,
## bmi_2011, closef_2011, closem_2011 and 71 more;
## 155 attributes confirmed unimportant: afraid_2011, alc45drinks_11,
## alcever_11, amphetev, artact_11 and 150 more;
## 35 tentative attributes left: alcfreq_11, cigct11, earnlvg_2011,
## email_2011, finanrespscale and 30 more;
#boruta_full <- TentativeRoughFix(boruta_full)
<- formula_to_vector_descriptions(getConfirmedFormula(boruta_full))
descriptions_full paste(descriptions_full, collapse = " + ")
## [1] "Current grade 02 + Hours sleep per night 02 + Stay with task 02 + Solve difficult tasks 02 + Orderly 02 + Do best 02 + Hurt someone 02 + Ever tried chew tobacco 02 + Ever drank? 02 + Ever tried marijuana 02 + Ever tried inhalants 02 + Partner 02 + Converse with adults 02 + Talk to teachers 02 + Ask questions 02 + Class discussions 02 + Joke with teachers 02 + Sexual intercourse 02 + Happy 02 + Interest in life 02 + Satisfied 02 + Contribution 02 + Community 02 + People are good 02 + Society works 02 + Responsibilities 02 + Warm relationships 02 + Challenged you 02 + Expressing ideas 02 + Global self concept 02 + Social initiative 02 + Grade in sch + Child age at assessment - months 02 + A1_1 how satisfied w/ life as a whole + B6a how good at responsibility + B6b how good at problem solving + B6c how good at money management + C1a how good at supervising comp + C1b how good at leading comp w/otrs + C1c how good at logic comp w/otrs + C1d how good at helping comp w/otrs + C1e how intelligent compared w/otrs + C1f how independent compared w/otrs + C1g how confident compared w/others + C1h how decisive compared w/others + C1j how well listen compared w/others + C1k how good at teaching compared w/otrs + D41 how close to father + D46 how close to mother + H15 wtr>2 wks depressed in past 12 mos + H16 wtr>2 wks no interest in life + H27 # of hours of sleep in 24-hr period + K1c how often others treat as stupid + K4 wtr ever sexually assaulted/raped + L1a religious preference + M1 frequency of happiness in last month + M2 freq of interest in life in last mo + M3 freq of feeling satisfied in last mo + M4 freq of feeling contrib to society + M5 freq of feeling belonging to communty + M6 freq of feeling society gettng better + M7 freq of feeling people basically good + M8 freq feelng way soc works makes sense + M9 freq feel managng daily responsibilty + M10 freq feeling trusting rels w/others + M11 freq of feeling challenged to grow + M12 freq feeling confident of own ideas + M13 freq of feeling liked personality + M14 freq of feeling life had direction + Mental health: non-spec psych distress + Body weight percentile status + Enrollment status + Completed education of mother + Year individual born 01 + black2011 + dob"
#print(boruta_full$finalDecision[boruta_full$finalDecision %in% c("Confirmed", "Tentative")])
<- gather(as.data.frame(boruta_full$ImpHistory), feature, measurement)
df_full_long <- df_full_long$feature
feature_full <- unlist(lapply(feature_full, fieldname_to_description))
feature_label_full <- unlist(lapply(df_full_long$measurement, function(x) { if(x == "-Inf") x = 0; return(x)}))
measurement_full <- data.frame(Feature = feature_full, FeatureLabel = feature_label_full, Measurement = measurement_full)
df_full_plotly plot_ly(df_full_plotly, x = ~FeatureLabel, y = ~Measurement, color = ~Feature, marker = list(size = 2), type = "box") %>%
layout(title="Box-and-whisker Plots Across All Features",
xaxis = list(title="Features"),
yaxis = list(title="Importance"),
showlegend=F)
set.seed(1234)
<- Boruta(PositiveChange ~ ., data = df_objective_scaled)
boruta_objective print(boruta_objective)
## Boruta performed 99 iterations in 4.879038 mins.
## 55 attributes confirmed important: alcever_11, anhed2wks_2011,
## black2011, bmi_2011, closef_2011 and 50 more;
## 95 attributes confirmed unimportant: amphetev, barbev, bingeeat11,
## BULLY02, danger_2011 and 90 more;
## 56 tentative attributes left: afraid_2011, alc45drinks_11, alcfreq_11,
## artact_11, asthma and 51 more;
#boruta_objective <- TentativeRoughFix(boruta_objective)
<- formula_to_vector_descriptions(getConfirmedFormula(boruta_objective))
descriptions_objective paste(descriptions_objective, collapse = " + ")
## [1] "Whether sample or nonsample + Race/ethnicity 02 + Religious comfort 02 + Spiritual? 02 + Current grade 02 + # Days ate: dairy 02 + Hours sleep per night 02 + Hurt someone 02 + Ever tried chew tobacco 02 + Ever drank? 02 + Ever tried marijuana 02 + Ever tried inhalants 02 + Partner 02 + Converse with adults 02 + Talk to teachers 02 + Ask questions 02 + Class discussions 02 + Joke with teachers 02 + Sexual intercourse 02 + Global self concept 02 + Social initiative 02 + Grade in sch + Child age at assessment - months 02 + A6 how often watched or read news + A9 wtr ever used internet + A10a wtr used internet for email + A10b wtr used internet for school + A10e wtr used internet for soc netwrking + B5a how much resonsiblty earng own livng + B5d how much responsiblty managing money + D11 how satisfied w/relationship + D41 how close to father + D46 how close to mother + H15 wtr>2 wks depressed in past 12 mos + H16 wtr>2 wks no interest in life + H27 # of hours of sleep in 24-hr period + H36 wtr drink alcohol-head + K1c how often others treat as stupid + K1e how often others treat as dishonest + K1k how often treated with less respect + K4 wtr ever sexually assaulted/raped + L1a religious preference + L3 freq of attend relig svcs last 12 mos + L4 wtr spiritual person + Responsibilities: financial + Mental health: worry + Mental health: social anxiety + Mental health: non-spec psych distress + Body weight percentile status + Enrollment status + Completed education of mother + Year individual born 01 + L8 importance of ethnic group identity + black2011 + dob"
#print(boruta_objective$finalDecision[boruta_objective$finalDecision %in% c("Confirmed", "Tentative")])
<- gather(as.data.frame(boruta_objective$ImpHistory), feature, measurement)
df_objective_long <- df_objective_long$feature
feature_objective <- unlist(lapply(feature_objective, fieldname_to_description))
feature_label_objective <- unlist(lapply(df_objective_long$measurement, function(x) { if(x == "-Inf") x = 0; return(x)}))
measurement_objective <- data.frame(Feature = feature_objective, FeatureLabel = feature_label_objective, Measurement = measurement_objective)
df_objective_plotly plot_ly(df_objective_plotly, x = ~FeatureLabel, y = ~Measurement, color = ~Feature, marker = list(size = 2), type = "box") %>%
layout(title="Box-and-whisker Plots Across Objective Features",
xaxis = list(title="Features"),
yaxis = list(title="Importance"),
showlegend=F)
<- run_stepwise(df_full_scaled)
ols_full <- lm_to_summary_model(ols_full)
df_ols_model_full_trans # get the shortlisted variable
# ols_confirmed_vars_full <- names(unlist(ols_full[[1]]))
# # remove the intercept
# ols_confirmed_vars_full <- ols_confirmed_vars_full[!ols_confirmed_vars_full %in% "(Intercept)"]
# ols_confirmed_vars_full <- unlist(lapply(ols_confirmed_vars_full, fieldname_to_description))
# print(ols_confirmed_vars_full)
# estimate variable importance
<- varImp(ols_full, scale=FALSE)
varImp_ols_full rownames(varImp_ols_full) <- unlist(lapply(rownames(varImp_ols_full), fieldname_to_description))
# summarize importance
print(varImp_ols_full)
## Overall
## M4 freq of feeling contrib to society 8.415381
## H14b how often felt hopeless in past mo 2.412283
## M11 freq of feeling challenged to grow 9.072687
## A9 wtr ever used internet 2.743320
## M12 freq feeling confident of own ideas 5.863598
## M5 freq of feeling belonging to communty 7.864477
## M9 freq feel managng daily responsibilty 5.003928
## M13 freq of feeling liked personality 4.157285
## M7 freq of feeling people basically good 3.299064
## H36 wtr drink alcohol-head 2.573680
## A10d wtr used internet for games 3.041384
## K3c pct of close friends unemp & looking 2.282857
## Volunteer? 02 1.948493
## H27 # of hours of sleep in 24-hr period 3.093330
## M14 freq of feeling life had direction 4.367528
## A11b wtr in social action groups 2.537139
## M10 freq feeling trusting rels w/others 4.252902
## Orderly 02 3.479624
## Ever tried chew tobacco 02 4.132048
## Satisfied 02 3.782453
## Ask questions 02 1.955088
## L6 hispanicity 2.527322
## Mental health: social anxiety 1.543961
## E71a ever in armed forces 2.422663
## Earnings from work last year 3.104391
## # Friends who drink 02 2.373593
## H40f wtr ever taken tranquilizers 2.364790
## L8 importance of ethnic group identity 2.521809
## M8 freq feelng way soc works makes sense 2.266375
## Contribution 02 2.714439
## Finish things 02 2.139137
## H40b wtr ever taken amphetamines 2.493403
## H53f diagnosed with std or hiv 2.061405
## A6 how often watched or read news 1.789889
## Oth relative 02 2.726997
## C1g how confident compared w/others 2.986164
## D28a number of children 2.307582
## Highest education level 2.308413
## # Days pe class at sch 02 1.910717
## H28b frequency of binge eating 2.111068
## Grandparent 02 1.693460
## Interest in life 02 2.254645
## K3b2 pct of close friends in coll/grad 1.497890
## B5b how much responsiblty payng own rent 2.392240
## # Days ate: sweets 02 1.797915
## Self-rated health 02 1.606074
## Class discussions 02 1.988545
## D46 how close to mother 1.695361
## D41 how close to father 1.726173
## B5d how much responsiblty managing money 1.508523
## K3e pct of close friends married, etc. 2.232634
## K3a pct of close friends w/job not coll 2.163490
## H23a time unit for hvy phys activity--hw 1.820720
## H23a freq of hvy physical activity--hw 2.369867
## K5b age when (first) assaulted 4.925401
## K4 wtr ever sexually assaulted/raped 4.787028
## M3 freq of feeling satisfied in last mo 1.624327
## Child age at assessment - months 02 3.436641
## dob 3.888772
## Current grade 02 2.839005
## Marital/cohabitation status 1.936559
## Month individual born 01 1.755591
## H12 wtr ever had emotional/psych probs 1.829863
## A10c wtr used internet to shop 1.847349
## C1k how good at teaching compared w/otrs 1.825239
## Community groups 02 1.730840
## G20 wtr currently in vo/tech training 1.643408
## H23a how often do vigorous activities 1.611952
# crossvalidate between the training and testing data
crossvalidate_ols(df_full_train_scaled, df_full_test_scaled)
## [1] TRUE
<- run_stepwise(df_objective_scaled)
ols_objective <- lm_to_summary_model(ols_objective)
df_ols_model_objective_trans # get the shortlisted variable
# ols_confirmed_vars_objective <- names(unlist(ols_objective[[1]]))
# # remove the intercept
# ols_confirmed_vars_objective <- ols_confirmed_vars_objective[!ols_confirmed_vars_objective %in% "(Intercept)"]
# ols_confirmed_vars_objective <- unlist(lapply(ols_confirmed_vars_objective, fieldname_to_description))
# print(ols_confirmed_vars_objective)
# estimate variable importance
<- varImp(ols_objective, scale=FALSE)
varImp_ols_objective rownames(varImp_ols_objective) <- unlist(lapply(rownames(varImp_ols_objective), fieldname_to_description))
# summarize importance
print(varImp_ols_objective)
## Overall
## K4 wtr ever sexually assaulted/raped 3.923239
## Mental health: non-spec psych distress 5.894814
## L4 wtr spiritual person 2.977607
## A10b wtr used internet for school 2.987919
## Mental health: worry 4.340180
## D11 how satisfied w/relationship 3.573606
## A7 how often read for pleasure 2.957345
## D41 how close to father 2.303565
## Volunteer? 02 2.168514
## A10e wtr used internet for soc netwrking 2.377067
## K1e how often others treat as dishonest 2.366514
## Mental health: social anxiety 2.785413
## Earnings from work last year 3.319378
## B5b how much responsiblty payng own rent 2.240868
## A10d wtr used internet for games 1.977201
## K3c pct of close friends unemp & looking 3.287942
## K15 how often drove when drunk or high 2.051966
## H40g wtr ever taken steroids 1.417393
## K7b age when (first) attacked 2.440831
## L7 race mention #1 3.108986
## # Days pe class at sch 02 1.865996
## G19 wtr non-academic training 1.686459
## H15 wtr>2 wks depressed in past 12 mos 2.035415
## H17 wtr had annual dr checkup last year 2.139712
## Coworker 02 1.968683
## H37 how often have drinks-hd 2.319189
## H50 wtr ever had sexual intercourse 2.710452
## Class discussions 02 3.268817
## # Days ate: meat 02 2.934206
## # Days ate: dairy 02 1.534244
## Grandparent 02 1.580340
## K7a # times physically attacked 2.258085
## Picked on you 02 2.170417
## Community groups 02 1.973635
## L8 importance of ethnic group identity 1.807781
## Highest education level 2.958225
## H23a how often do vigorous activities 2.419305
## H27 # of hours of sleep in 24-hr period 2.588787
## L3 freq of attend relig svcs last 12 mos 2.101606
## Damaged school 02 2.663003
## Stopped by police 02 2.012515
## A10c wtr used internet to shop 1.625131
## D8 wtr romantic relationship now 3.241477
## Marital/cohabitation status 2.839317
## K1k how often treated with less respect 1.791721
## H40a wtr ever taken diet pills 1.561723
## A6 how often watched or read news 1.542058
## L6 hispanicity 1.600330
## A9 wtr ever used internet 1.850545
## K3n pct of close friends who use drugs 1.431976
# crossvalidate between the training and testing data
crossvalidate_ols(df_objective_train_scaled, df_objective_test_scaled)
## [1] TRUE
As you will see below, the full featured dataset contains more data than the “objective” counterparts. Each dataset provides a different view into the question at hand.
# combine the two common features for full dataset
<- as.data.frame(df_ols_model_full_trans[is.element(set = df_ols_model_full_trans$Variables, el = as.matrix(descriptions_full)), ])
df_full_feauture_selection_combined
# combine the two common features for objective dataset
<- as.matrix(df_ols_model_objective_trans[is.element(set = df_ols_model_objective_trans$Variables, el = as.matrix(descriptions_objective)), ])
df_objective_feature_selection_combined
print("Combination of Boruta and OLS Full Datasets ")
## [1] "Combination of Boruta and OLS Full Datasets "
df_full_feauture_selection_combined
## Variables pvalues Stars
## 1 M11 freq of feeling challenged to grow 2.824861e-19 ***
## 5 M9 freq feel managng daily responsibilty 6.136332e-07 ***
## 8 M14 freq of feeling life had direction 1.324403e-05 ***
## 15 Child age at assessment - months 02 0.0006016748 ***
## 16 M7 freq of feeling people basically good 0.0009880329 ***
## 20 C1g how confident compared w/others 0.002861189 **
## 21 Current grade 02 0.004573647 **
## 22 A9 wtr ever used internet 0.006139365 **
## 33 # Friends who drink 02 0.01771506 *
## 44 H28b frequency of binge eating 0.03489698 *
## 47 Ask questions 02 0.05071937 .
## 48 Volunteer? 02 0.05150324 .
## 49 Marital/cohabitation status 0.05294758 .
## 52 H12 wtr ever had emotional/psych probs 0.06742727 .
## 54 H23a time unit for hvy phys activity--hw 0.06880686 .
## 58 Community groups 02 0.08364293 .
## 59 D41 how close to father 0.08447916 .
## 60 D46 how close to mother 0.09017097 .
## 62 G20 wtr currently in vo/tech training 0.1004642 _
## 63 M3 freq of feeling satisfied in last mo 0.1044722 _
## 64 H23a how often do vigorous activities 0.1071386 _
## 65 Self-rated health 02 0.1084242 _
## 66 Mental health: social anxiety 0.1227645 _
## 67 B5d how much responsiblty managing money 0.1315873 _
## 68 K3b2 pct of close friends in coll/grad 0.1343282 _
## 69 (Intercept) 1 _
## NA NULL NULL NULL
print("Combination of Boruta and OLS Objective Datasets ")
## [1] "Combination of Boruta and OLS Objective Datasets "
df_objective_feature_selection_combined
## Variables pvalues Stars
## 6 "K3c pct of close friends unemp & looking" 0.001027497 "**"
## 17 "H50 wtr ever had sexual intercourse" 0.006779113 "**"
## 24 "H37 how often have drinks-hd" 0.02048952 "*"
## 25 "D41 how close to father" 0.02135375 "*"
## 27 "B5b how much responsiblty payng own rent" 0.02514881 "*"
## 28 "Picked on you 02" 0.03009784 "*"
## 31 "L3 freq of attend relig svcs last 12 mos" 0.0357181 "*"
## 32 "K15 how often drove when drunk or high" 0.04030866 "*"
## 34 "Stopped by police 02" 0.04430541 "*"
## 36 "Community groups 02" 0.04856676 "*"
## 39 "A9 wtr ever used internet" 0.06438898 "."
## 40 "L8 importance of ethnic group identity" 0.07079747 "."
## 41 "K1k how often treated with less respect" 0.07333554 "."
## 43 "A10c wtr used internet to shop" 0.1042992 "_"
## 44 "L6 hispanicity" 0.1096903 "_"
## 46 "H40a wtr ever taken diet pills" 0.1185186 "_"
## 47 "A6 how often watched or read news" 0.1232248 "_"
## 48 "# Days ate: dairy 02" 0.1251347 "_"
## NA NULL NULL NULL