# Script_HPP_Pilot_PA_Preproc.r

#### Code_pre-proc_ProlificAcad_HPP####
#
### Purpose ###
# This script pre-processes the pilot data collected via Prolific Academic, as reported in IJzerman et al. (2018), Human Penguin Project (HPP).
# Overview of HPP: https://osf.io/2rm5b/
#
#
# Code author: Chuan-Peng Hu, PhD, 
# Affiliated to: Neuroimaging Center (NIC), Johannes Gutenberg University Medical Center, 55131 Mainz, Germany;
# Email: hcp4715@gmail.com
# 
# Author   Date (d/m/y) Notes/Changes
# ========   =========   ========
# C-P. Hu    09/08/18    split the preprocessing into two files: preproc and score calculation; this one is preproc.
#
#
### input data ####
#
# Original data: sav file: 'prolific academic corrected dataset december 2015.sav' 
#
# Revised data: 'prolific_academic_corrected_201512_rev_yjx2_3.csv' (with codebook 'Codebook_HPP_prolific_academic_0619.xlsx')
#       We thank Jixin Yin for checking the data and preparing the codebook.
#
# Also, the weight, height, avghumid, and mintemp data were taken from the data reported in a previous version of the manuscript: pilotpenguins.sav
# 
### output file and Variables ####
#
# output file: 'Data_Raw_HPP_Pilot_PA_Share.csv'
# 
# Data related to the following variables are kept (reported in the article):
# Age            -- using 2015 minus the birth year.
# anxiety        -- subscale of attachment, Fraley et al., 2000,  using sum score
# attachhome     -- attachment to home; Harris et al., 1996       using average score
# attachphone    -- attachment to the phone                       using sum score
# avghumid       -- average humidity of the day                   
# avgtemp        -- average temperature
# avoidance      -- subscale of attachment, Fraley et al., 2000   using sum score
# gluctot        -- daily sugary drink consumption, Henriksen et al., 2014
# health         -- health condition 
# height         -- height (in meter)
# Medication     -- medication condition
# mintemp        -- minimum temperature of the day
# networksize    -- social network; Cohen et al., 1997
# nostalgia      -- (Routledge et al., 2008)                       using sum score
# selfcontrol    -- self-control, Tangney et al., 2004             using sum score
# Sex 
# site           -- the location of participants
# smoking        -- 
# socialdiversity--
# socialembedded -- social network; Cohen et al., 1997
# stress         -- Perceived stress (Cohen & Wills, 1985)         using sum score
# weight         -- weightkg (weight in kg)

### Preparing ####
# Request an English locale. "English" is the locale name understood by
# Windows; the remaining candidates are tried in turn on platforms that reject
# it, so the script no longer depends on a Windows-only name. If no candidate
# is accepted, the locale is left untouched.
for (loc_name in c("English", "en_US.UTF-8", "en_US")) {
  # Sys.setlocale() returns "" (with a warning) when a request is not honoured.
  if (nzchar(suppressWarnings(Sys.setlocale("LC_ALL", loc_name)))) {
    break
  }
}
Sys.setenv(LANG = "en") # set the feedback language to English

# Start from a clean workspace: remove all variables except functions
# (this also discards the loop variable used above).
rm(list = setdiff(ls(), lsf.str()))

# Attach a package, installing it first when it is not yet available.
#
# Args:
#   x: name of the package, as a single character string.
#
# Returns:
#   NULL, invisibly; the function is called for its side effect of attaching
#   the package to the search path.
#
# Stops with an error naming the package when it still cannot be loaded after
# the installation attempt.
pkgTest <- function(x) {
  # requireNamespace() only checks availability; attaching is done once below.
  if (!requireNamespace(x, quietly = TRUE)) {
    install.packages(x, dependencies = TRUE)
    if (!requireNamespace(x, quietly = TRUE)) {
      stop("Package not found: ", x, call. = FALSE)
    }
  }
  library(x, character.only = TRUE)
  invisible(NULL)
}
# Attach every package listed here, installing any that are missing.
pkgNeeded <- c("psych", "tidyverse", "foreign")
# pkgTest() is called for its side effects only, so the list of return values
# is discarded instead of being auto-printed at top level.
invisible(lapply(pkgNeeded, pkgTest))
rm("pkgNeeded") # remove the variable 'pkgNeeded';

#### Preprocessing #####
# Load data
# Empty cells, single-space cells and the literal "NA" are read as missing.
DataRaw <- read.csv(
  file = "prolific_academic_corrected_201512_rev_yjx2_3.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
)

## read the previously validated (reported) data with the same settings
repoData_PA_s <- read.csv(
  file = "reportedPA_rm_diff_newer.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
)

# sort the reported data by "age", then "anxiety", then "avoidance"
repoData_PA_s_reord <- repoData_PA_s[
  order(
    repoData_PA_s[, "age"],
    repoData_PA_s[, "anxiety"],
    repoData_PA_s[, "avoidance"]
  ),
]

# recode the temperature:
# One participant entered 32 for Q7 with unit code 2 for Q8, which converts to
# 0 at T1; the same participant entered 32 for Q65 with unit code 1 for Q66,
# so the Q8 answer is corrected to 1 here.
# which() drops NA comparisons, so rows with a missing Q7 or Q8 are untouched.
DataRaw$Q8[which(DataRaw$Q7 == 32 & DataRaw$Q8 == 2)] <- 1

# Bring the T1 reading onto one scale: rows with unit code 2 are converted with
# (x - 32) * 5 / 9 (the Fahrenheit-to-Celsius formula); every other row keeps
# the raw Q7 value. The conversion is vectorised, so a missing unit code or an
# empty data set no longer aborts the script (the former element-wise `if`
# inside a `1:length()` loop failed in both cases); rows with a missing unit
# code keep the raw value.
t1_convert <- which(DataRaw$Q8 == 2)
DataRaw$Temperature_t1_r <- DataRaw$Q7
DataRaw$Temperature_t1_r[t1_convert] <- ((DataRaw$Q7[t1_convert] - 32) * 5) / 9

# Same conversion for T2. Q66r is a copy of the Q66 unit code in which missing
# values are replaced by 0, i.e. rows without a unit code are not converted.
DataRaw$Q66r <- DataRaw$Q66
DataRaw$Q66r[is.na(DataRaw$Q66r)] <- 0

t2_convert <- which(DataRaw$Q66r == 2)
DataRaw$Temperature_t2_r <- DataRaw$Q65
DataRaw$Temperature_t2_r[t2_convert] <- ((DataRaw$Q65[t2_convert] - 32) * 5) / 9

# the row indices were only needed for the conversion
rm(t1_convert, t2_convert)

# Average of the two readings. rowSums(na.rm = TRUE) / 2 halves the value when
# one reading is missing, so rows without a T2 reading (Q65 is NA) are
# overwritten with the T1 reading alone.
# NOTE(review): a row with a missing T1 but a valid T2 reading would still get
# T2 / 2 here -- confirm that no such row exists in the data.
DataRaw$avgtemp_r <- rowSums(
  DataRaw[, c("Temperature_t1_r", "Temperature_t2_r")],
  na.rm = TRUE
) / 2
DataRaw$avgtemp_r[is.na(DataRaw$Q65)] <- DataRaw$Temperature_t1_r[is.na(DataRaw$Q65)]

# unify the birth year: the (rounded) Q87 value is prefixed with "19" and the
# resulting string is converted to an integer.
# NOTE(review): this presumably expects Q87 to hold the last two digits of a
# 19xx birth year -- confirm against the codebook.
DataRaw$birthyear <- as.integer(paste0("19", round(DataRaw$Q87, digits = 2)))

# exclude participants
# A participant is kept only when all three conditions hold:
# c1: the average temperature is greater than 34.99
# c2: nothing cold or warm was eaten or drunk in the 10 minutes before (eatdrink == 1)
# c3: no exercise in the 60 minutes before the survey (exercise == 2)
#
# The step-wise data frames below record who falls out at each criterion;
# the final sample is `valid.data`. Rows where a comparison is NA are dropped
# from every comparison-based subset (which() ignores NA).

# first: filter eating / drinking
valid.data_Eat <- DataRaw[which(DataRaw$eatdrink != 1), , drop = FALSE] # ate or drank (3 participants)
valid.data_NA <- DataRaw[which(is.na(DataRaw$eatdrink)), , drop = FALSE] # eat/drink answer is NA (2 participants)
valid.data_NoEat <- DataRaw[which(DataRaw$eatdrink == 1), , drop = FALSE] # did not eat or drink

# second: filter exercise (among those who did not eat or drink)
valid.data_exercise <- valid.data_NoEat[which(valid.data_NoEat$exercise != 2), , drop = FALSE] # exercised within one hour (2 participants)
valid.data_exercise_NA <- valid.data_NoEat[which(is.na(valid.data_NoEat$exercise)), , drop = FALSE] # exercise answer is NA
valid.data_NoExercise <- valid.data_NoEat[which(valid.data_NoEat$exercise == 2), , drop = FALSE] # no exercise within one hour

# third: filter average temperature
# participants passing the first two criteria but with a low average temperature (1 participant)
valid.data_Tmp <- valid.data_NoExercise[which(valid.data_NoExercise$avgtemp_r < 34.99), , drop = FALSE]

# final sample: all three criteria applied at once to the full data
valid.data <- DataRaw[
  which(with(DataRaw, avgtemp_r > 34.99 & eatdrink == 1 & exercise == 2)),
  ,
  drop = FALSE
]

# criteria: T1 is greater than 34.99
#valid.data1 <- subset(DataRaw,Temperature_t1_r > 34.99)
# criteria: T2 is greater than 34.99
#valid.data2 <- subset(DataRaw,Temperature_t2_r > 34.99)
# criteria: T1 & T2 is greater than 34.99
#valid.data3 <- subset(DataRaw,Temperature_t2_r > 34.99 & Temperature_t1_r > 34.99 )
# criteria: T1 or T2 or average is greater than 34.99
#valid.data4 <- subset(DataRaw,Temperature_t2_r > 34.99 | Temperature_t1_r > 34.99 | avgtemp_r > 34.99)

# Calculate the age of each participant as 2015 minus the birth year.
# Previously the birth year itself was stored as the age. Age is the first
# sort key when the rows are ordered to match the reported data (itself ordered
# by age, anxiety and avoidance), so it has to be an age, not a birth year.
valid.data$age <- 2015 - valid.data$birthyear

# calculate the anxiety, avoidance and attachhome scores used for re-ordering

## ECR anxiety: items ECR1 to ECR18, with items 9 and 11 reverse coded
ECRanxietyNames <- paste0("ECR", 1:18)
# numeric keys: item position, negative when reverse coded (not used below)
ECRanxietyKeys <- replace(as.numeric(1:18), c(9, 11), -c(9, 11))
# character keys: item name, prefixed with "-" when reverse coded
ECRanxietyKeys2 <- ifelse(
  ECRanxietyKeys < 0,
  paste0("-", ECRanxietyNames),
  ECRanxietyNames
)
ECRanxietyScore <- psych::scoreItems(
  ECRanxietyKeys2,
  valid.data[, ECRanxietyNames],
  totals = TRUE,
  min = 1,
  max = 7
)
valid.data$anxiety <- ECRanxietyScore$scores # sum score (totals = TRUE)

## score and alpha for ECR avoidance ####
## items ECR19 to ECR36; positions 2, 4, 8-13 and 15-18 are reverse coded
ECRavoidanceNames <- paste0("ECR", 19:36)
# numeric keys: item position, negative when reverse coded
ECRavoidanceKeys <- replace(
  as.numeric(1:18),
  c(2, 4, 8:13, 15:18),
  -c(2, 4, 8:13, 15:18)
)
# character keys: item name, prefixed with "-" when reverse coded
ECRavoidanceKeys2 <- ifelse(
  ECRavoidanceKeys < 0,
  paste0("-", ECRavoidanceNames),
  ECRavoidanceNames
)

# calculate the alpha coefficient
ECRavoidanceAlpha <- psych::alpha(
  valid.data[, ECRavoidanceNames],
  keys = ECRavoidanceKeys
)
print(ECRavoidanceAlpha$total) # std. alpha 0.9451 (value noted by the author)
ECRavoidanceScore <- psych::scoreItems(
  ECRavoidanceKeys2,
  valid.data[, ECRavoidanceNames],
  totals = TRUE,
  min = 1,
  max = 7
)
#Datasum$ECRavoidance <- ECRavoidanceScore$scores # average score
valid.data$avoidance <- ECRavoidanceScore$scores # sum score (totals = TRUE)


## score for attachment to home: items HOME1 to HOME9
homeNames <- paste0("HOME", 1:9)
homeKeys <- as.numeric(1:9) # no negative (reverse-coded) entries; not used below
# row sum with missing answers skipped, divided by the full number of items
valid.data$attachhome <- rowSums(valid.data[, homeNames], na.rm = TRUE) / length(homeNames)

## re-order the data
# sort the rows by "age", then "anxiety", then "avoidance"
valid.data_reord <- valid.data[
  order(
    valid.data[, "age"],
    valid.data[, "anxiety"],
    valid.data[, "avoidance"]
  ),
]

## collect the item-level columns that are kept for the open data set
SNINames <- paste0("SNI", 1:32) # colnames for social network indices
scontrolNames <- paste0("scontrol", 1:13)
stressNames <- paste0("stress", 1:14)
phoneNames <- paste0("phone", 1:9)
# NOTE(review): the last name is spelled "onlineide11", unlike the other ten;
# it is kept exactly as written -- confirm it matches the column in the data.
onlineNames <- c(paste0("onlineid", 1:10), "onlineide11")
ECRNames <- paste0("ECR", 1:36)
nostagliaNames <- paste0("SNS", 1:7)
homeNames <- paste0("HOME", 1:9)

# ALEX items are split into two sets: items 1-11 and items 12-16
didfNames <- paste0("ALEX", 1:11)
eotNames <- paste0("ALEX", 12:16)

OtherNames <- c("birthyear", "Sex", "avgtemp", "health")

# full column list, in the order the columns are selected
selectNames <- c(
  OtherNames,
  SNINames,
  scontrolNames,
  stressNames,
  phoneNames,
  onlineNames,
  ECRNames,
  homeNames,
  nostagliaNames,
  didfNames,
  eotNames
)

valid.data_share <- valid.data_reord[, selectNames]

# newNames <- c('glucoseplosone','Site','avghumid','mintemp')
## gluctot and artgluctot (already calculated in multi-site dataset)

# sum of the three Q89 text-entry amounts; missing entries are skipped
valid.data_share$glucoseplosone <- rowSums(
  valid.data_reord[, c("Q89_6_1_TEXT", "Q89_7_1_TEXT", "Q89_12_1_TEXT")],
  na.rm = TRUE
)
#Datasum$artgluctot <- rowSums(valid.data[,c("Q89_8_1_TEXT",'Q89_9_1_TEXT','Q89_13_1_TEXT')],na.rm = T)
valid.data_share$Site       <- "ProlificAcademic"
valid.data_share$birthyear  <- valid.data_reord$birthyear
valid.data_share$avgtemp    <- valid.data_reord$avgtemp_r # use the recoded average temperature
#valid.data_share$Medication <- valid.data_reord$meds
valid.data_share$Smoking    <- valid.data_reord$smoke

# from osf reported data
# These columns are matched to the participants purely by row position, so
# both tables must have the same number of rows. `$<-` on a data frame
# recycles a shorter vector whose length divides the row count, which would
# misalign the values silently; hence the explicit check.
stopifnot(nrow(repoData_PA_s_reord) == nrow(valid.data_share))
valid.data_share$avghumid <- repoData_PA_s_reord$avghumid
valid.data_share$mintemp  <- repoData_PA_s_reord$mintemp
#valid.data_share$heightm  <- repoData_PA_s_reord$heightm
#valid.data_share$weightkg <- repoData_PA_s_reord$weightkg
#valid.data_share <-valid.data_share[,order(names(valid.data_share))] # order columns by alphabeta 

# move the summary columns to the front and keep all other columns after them
valid.data_share <- valid.data_share %>%
  dplyr::select("Site", "birthyear", "Sex", "health", "avgtemp", "glucoseplosone", "Smoking",
                "avghumid", "mintemp", everything())

# write the sharable data
write.csv(valid.data_share, 'Data_Raw_HPP_Pilot_PA_Share.csv', row.names = FALSE)