final_project_common.knit

library(haven)
library(tidyverse)
#library(stringr)
#library(tidyr)
#library(dplyr)
library(knitr)
library(kableExtra)
library(labelled)
library(DT)
library(plotly)
library(stats)
library(caret)
library(mltools)
library(data.table)
tbl_dta <- read_dta("http://kevinwurn.github.io/UMich-HS650/final_project_files/clean-childvarsJune2021.dta", encoding = "latin1")
filter_remove <- c("persid", "ER34001", "ER34101", "date_ciw2002", "mage2002", "age2002", "tasdate2011", "mage2011", "age2011", "eligsecL", "sample", "agecat_11", "parent11", "higheduc", "college", "employed", "sleeprecomm", "nutrit11_1", "nutrit11_2", "bingedrk3", "subusect", "goodhlth_11", "agedx_cat", "k6miss", "k6scale", "spd", "phq2", "ewb1_flr2011", "ewb2_flr2011", "ewb3_flr2011", "avgemot", "sumemot", "swb1_flr2011", "swb2_flr2011", "swb3_flr2011", "swb4_flr2011", "swb5_flr2011", "avgsoc", "sumsoc", "pwb1_flr2011", "pwb2_flr2011", "pwb3_flr2011", "pwb4_flr2011", "pwb5_flr2011", "pwb6_flr2011", "avgpsyc", "sumpsyc", "flrmiss11", "flr_fxing2011", "flourish_2011", "ewb1_lan2011", "ewb2_lan2011", "ewb3_lan2011", "swb1_lan2011", "swb2_lan2011", "swb3_lan2011", "swb4_lan2011", "swb5_lan2011", "pwb1_lan2011", "pwb2_lan2011", "pwb3_lan2011", "pwb4_lan2011", "pwb5_lan2011", "pwb6_lan2011", "lan_fxing2011", "lngsh_bin2011", "mod_bin2011", "posmh_2011", "ewb1_flr2002", "ewb2_flr2002", "ewb3_flr2002", "swb1_flr2002", "swb2_flr2002", "swb3_flr2002", "swb4_flr2002", "swb5_flr2002", "pwb1_flr2002",  "pwb2_flr2002", "pwb3_flr2002", "pwb4_flr2002", "flourishingfull_02", "flr_fxing2002", "flourish_2002", "ewb1_lan2002", "ewb2_lan2002", "ewb3_lan2002", "swb1_lan2002", "swb2_lan2002", "swb3_lan2002", "swb4_lan2002", "swb5_lan2002", "pwb1_lan2002", "pwb2_lan2002", "pwb3_lan2002", "pwb4_lan2002", "lan_fxing2002", "lngsh_bin2002", "mod_bin2002", "posmh_2002")
df_full <- tbl_dta[ , !names(tbl_dta) %in% filter_remove]
# create dataframe for display purposes in the project summary page
df_full_display <- df_full
#remove columns where their variance is 0 - meaning all values are the same
df_full <- df_full[,apply(df_full, 2, var, na.rm=TRUE) != 0]
#factorialize change variable into "PositiveChange"
df_full$PositiveChange <- df_full$change %in% c(2)
df_full$PositiveChange <- factor(df_full$PositiveChange, levels=c(T, F), labels = c("Positive Change", " Not Positive Change"))
# subset out missing values from mhstatus which is roughly 30% of participants (835) =(
# and remove haven attributes
df_full <- as.data.frame(zap_labels(df_full[!is.na(df_full$mhstatus), ]))
df_full_na <- df_full
#change all NA values to -1 to indicate missing
df_full <- df_full %>% replace(is.na(.), -1)
#factorialize mhstatus and rename levels for increased legibility and create a separate vector as mhstatus will be removed
mhstatus <- factor(df_full$mhstatus, levels=c(0:8, -1), labels = c("Languishing to Languishing", "Languishing to Moderate", "Languishing to Flourishing", "Moderate to Languishing", "Moderate to Moderate", "Moderate to Flourishing", "Flourishing to Languishing", "Flourishing to Flourishing", "Flourishing to Flourishing", "Missing"))
#remove additional features directly related to PositiveChange
#remove TA110835 too many values that are NA
#additionally remove other fields "not used"
filter_remove_addon <- c("change", "mhstatus", "flscale_02", "cdsiiresult", "CHLDREL",
                   "stratum", "cluster", "weight", "ER33601", "ER33602", "ER34001", "ER34002","TA090001", "ER47301", "ER34101", "ER34102", "TA110001", "TA110007", "TA110009", "TA111121", "Q23IWMTH", "Q23IWDAY", "Q23IWYR", "Q24MONTH", "Q24YEAR")

filter_save_interval <- c("grade_02", "Q23K14A", "Q23K14B", "Q23K14C", "Q23K14D", "Q23K14E", "Q23K14F", "Q23K14G", "Q23K15", "Q23K16", "Q23K17", "Q23K21", "Q23L11A", "Q23L11B", "Q23L11C", "Q23L11D", "Q23L11E", "Q23L11F", "Q23L11G", "Q23L11H", "Q23L11I", "Q23L11J", "Q23L30A", "Q23L30B", "Q23L31", "ewb1_02", "ewb2_02", "ewb3_02", "swb1_02", "swb2_02", "swb3_02", "swb4_02", "swb5_02", "pwb1_02", "pwb2_02", "pwb3_02", "pwb4_02", "BULLY02", "GLBCN02", "SOCINT02", "Q24BMI", "Q24B2", "Q24IWAGE", "WTIND02", "news_2011", "read_2011", "email_2011", "internetsch_2011", "internetshop_2011", "internetgame_2011", "socialnetwork_2011", "tv_2011", "ownactions", "probsolve", "moneymng", "creditcardmng", "supervise_2011", "leader_2011", "logical_2011", "helping_2011", "intelligence_2011", "independence_2011", "confidence_2011", "decisiveness_2011", "listening_2011", "teaching_2011", "meetppl_11", "shy_11", "selfcon_11", "perform_11", "romsat_2011", "children_2011", "closef_2011", "closem_2011", "earnlvg_2011", "payrent_2011", "paybills_2011", "mangmon_2011", "helping_2011", "TA110826", "nervous_2011", "hopeless_2011", "restless_2011", "effort_2011", "sad_2011", "worthless_2011", "TA110893", "sleep11", "snack11", "bingeeat11", "cigct11", "alcfreq_11", "alc45drinks_11", "TA110974", "courtesy_2011", "service_2011", "stupid_2011", "afraid_2011", "dishonest_2011", "superior_2011", "respect_2011", "peerjob_2011", "peercoll_2011", "peerue_2011", "peermarried_2011", "peervotech_2011", "peerprnt_2011", "peerdrunk_2011", "peerdrug_2011", "TA111023", "TA111025", "TA111026", "danger_2011", "property_2011", "fight_2011", "dui_2011", "relig_freq", "ewb1_2011","ewb2_2011", "ewb3_2011", "swb1_2011", "swb2_2011", "swb3_2011", "swb4_2011", "swb5_2011", "pwb1_2011", "pwb2_2011", "pwb3_2011", "pwb4_2011", "pwb5_2011", "pwb6_2011", "finanrespscale", "worryscale_11", "socanxscale_11", "TA111121", "TA111125", "riskybxscale_11", "bmi_2011", "enrollstat_11", "mothered_11", "fathered_11", "yaearnings_10", "ER33606", "TA111060")

df_full <- df_full[ , !names(df_full) %in% filter_remove_addon]
df_full_na <- df_full_na[ , !names(df_full_na) %in% filter_remove_addon]
df_full_display <- df_full_display[ , !names(df_full_display) %in% filter_remove_addon]

df_save_onehot <- df_full[ , names(df_full) %in% filter_save_interval]
df_save_onehot <- as.data.frame(lapply(df_save_onehot, scale))
df_full_onehot <- df_full[ , !names(df_full) %in% filter_save_interval]
df_full_onehot <- lapply(df_full_onehot, as.factor)
df_full_onehot <- data.frame(predict(dummyVars(" ~ .", data = df_full_onehot), newdata = df_full_onehot)) 
df_full_onehot <- cbind(df_full_onehot, df_save_onehot)

#create another df removing emotions that contribute to flourishing.  only keep physical contributing factors to flourishing
filter_remove_subjective <- c("ewb1_2011", "ewb2_2011", "ewb3_2011", "swb1_2011", "swb2_2011", "swb3_2011", "swb4_2011", "swb5_2011", "pwb1_2011", "pwb2_2011", "pwb3_2011", "pwb4_2011", "pwb5_2011", "pwb6_2011", "nervous_2011", "hopeless_2011", "restless_2011", "effort_2011", "sad_2011", "worthless_2011", "k6sxfreq_2011", "degreefreq_2011", "k6interfere_2011", "ownactions", "probsolve", "moneymng", "creditcardmng", "supervise_2011", "leader_2011", "logical_2011", "helping_2011", "intelligence_2011", "independence_2011", "confidence_2011", "decisiveness_2011", "listening_2011", "teaching_2011", "meetppl_11", "shy_11", "selfcon_11", "perform_11", "lifesat_2011", "ewb1_02", "ewb2_02", "ewb3_02", "swb1_02", "swb2_02", "swb3_02", "swb4_02", "swb5_02", "pwb1_02", "pwb2_02", "pwb3_02", "pwb4_02", "Q23K26A", "Q23K26B", "Q23K26C", "Q23K26D", "Q23K26E", "Q23K28", "TA111121")

df_objective <- df_full[ , !names(df_full) %in% filter_remove_subjective]
df_objective_display <- df_full_display[ , !names(df_full_display) %in% filter_remove_subjective]
df_objective_na <- df_full_na[ , !names(df_full_na) %in% filter_remove_subjective]
df_objective_onehot <- df_full_onehot[ , !names(df_full_onehot) %in% filter_remove_subjective]
# df_objective_onehot <- lapply(df_objective_onehot, as.factor)
# df_objective_onehot <- data.frame(predict(dummyVars(" ~ .", data = df_objective_onehot), newdata = df_objective_onehot)) 

#scale full and ojbective datasets
df_full_scaled <- as.data.frame(lapply(df_full, as.numeric))
df_full_scaled <- as.data.frame(lapply(df_full_scaled, scale))
df_objective_scaled <- as.data.frame(lapply(df_objective, as.numeric))
df_objective_scaled <- as.data.frame(lapply(df_objective_scaled, scale))

df_full_na_scaled <- as.data.frame(lapply(df_full_na, as.numeric))
df_full_na_scaled <- as.data.frame(lapply(df_full_na_scaled, scale))
df_objective_na_scaled <- as.data.frame(lapply(df_objective_na, as.numeric))
df_objective_na_scaled <- as.data.frame(lapply(df_objective_na_scaled, scale))

#create training / testing data dataframes
set.seed(1234)
subsetInterval <- sample(nrow(df_full_scaled), floor(nrow(df_full_scaled) * 0.80))  # 80% training + 20% testing

#unscaled
df_full_train <- df_full[subsetInterval, ]
df_full_test <- df_full[-subsetInterval, ]
df_full_train_na <- df_full_na[subsetInterval, ]
df_full_test_na <- df_full_na[-subsetInterval, ]
df_full_train_onehot <- df_full_onehot[subsetInterval, ]
df_full_test_onehot <- df_full_onehot[-subsetInterval, ]
df_objective_train <- df_objective[subsetInterval, ]
df_objective_test <- df_objective[-subsetInterval, ]
df_objective_train_na <- df_objective_na[subsetInterval, ]
df_objective_test_na <- df_objective_na[-subsetInterval, ]
df_objective_train_onehot <- df_objective_onehot[subsetInterval, ]
df_objective_test_onehot <- df_objective_onehot[-subsetInterval, ]

#scaled
df_full_train_scaled <- df_full_scaled[subsetInterval, ]
df_full_test_scaled <- df_full_scaled[-subsetInterval, ]
df_objective_train_scaled <- df_objective_scaled[subsetInterval, ]
df_objective_test_scaled <- df_objective_scaled[-subsetInterval, ]

df_full_train_na_scaled <- df_full_na_scaled[subsetInterval, ]
df_full_test_na_scaled <- df_full_na_scaled[-subsetInterval, ]
df_objective_train_na_scaled <- df_objective_na_scaled[subsetInterval, ]
df_objective_test_na_scaled <- df_objective_na_scaled[-subsetInterval, ]

mhstatus_train <- mhstatus[subsetInterval]
mhstatus_test <- mhstatus[-subsetInterval] 

#common variables
k_full <- as.integer(sqrt(nrow(df_full)/2))
k_full_train <- as.integer(sqrt(nrow(df_full_train_scaled)/2))
k_objective <- as.integer(sqrt(nrow(df_objective)/2))
k_objective_train <- as.integer(sqrt(nrow(df_objective_train_scaled)/2))

k_full_onehot <- as.integer(sqrt(nrow(df_full_onehot)/2))
k_full_train_onehot <- as.integer(sqrt(nrow(df_full_train_onehot)/2))
k_objective_onehot <- as.integer(sqrt(nrow(df_objective_onehot)/2))
k_objective_train_onehot <- as.integer(sqrt(nrow(df_objective_train_onehot)/2))

c50_trials_full <- 3
c50_trials_objective <- 3
######################################################
## below is to set testing data as training data to speed up rendering
## set currently_developing to F when ready to go live.
######################################################
currently_developing <-F
if(currently_developing) {
  df_full_train_scaled <- df_full_test_scaled
  df_full_train_na_scaled <- df_full_test_na_scaled
  df_full_train_onehot <- df_full_test_onehot
  df_objective_train_scaled <- df_objective_test_scaled
  df_objective_train_na_scaled <- df_objective_test_na_scaled
  df_objective_train_onehot <- df_objective_test_onehot
  
  df_full_scaled <- df_full_test_scaled
  df_full_na_scaled <- df_full_test_na_scaled
  df_full_onehot <- df_full_test_onehot
  df_objective_scaled <- df_objective_test_scaled
  df_objective_na_scaled <- df_objective_test_na_scaled
  df_objective_onehot <- df_objective_test_onehot

  c50_trials_full <- 1
  c50_trials_objective <- 1
}

#### Functions
# given the obscure fieldname, return its description
fieldname_to_description <- function(colname) {
  if(!is.na(colname) && length(colname) > 0) {
    returnValue = str_to_sentence(attr(tbl_dta[[colname]], "label"))
    if(is.na(returnValue) || length(returnValue) == 0)
      returnValue = colname
  }
  else
    returnValue = ""
  return(returnValue)
}

# given the nominal value, and the obscure feature name, return the descriptive value
value_fieldname_to_description <- function(value, colname) {
  values <- unname(val_labels(unique(to_labelled(tbl_dta)[[colname]])))
  value_names <- names(val_labels(unique(to_labelled(tbl_dta)[[colname]])))
  returnvValue = value
  if(is.na(value))
    returnvValue = ""
  else {
    for(i in 1:length(values)) {
      if(startsWith(value, ".")) {
        returnvValue = "."
      }
      else if(!is.null(values[i]) && !is.na(values[i]) && value == values[i]) {
        returnvValue = str_to_sentence(value_names[i])
      }
    }    
  }
  return(returnvValue)
}