Chapter 3 US preprocessing
# Function to read adolescents' data from one file
#
# Reads one SPSS file with read_spss(), keeps the participant ID, age,
# social media, TV and SDQ item variables, recodes negative values to NA,
# strips the wave prefix from variable names, and removes SPSS labels.
#
# filename: path to a single SPSS (.sav) file.
# Returns the cleaned dataset for that file.
read_adolescent_data <- function(filename) {
  # Select relevant variables, define NAs (any negative numbers), remove "yp" from the beginning of variable names
  dataset <- read_spss(filename)
  dataset <- dataset %>%
    select(-ends_with("_sex")) %>%
    # str_replace() removes only the first "yp" found in each name
    rename_all(~str_replace(., "yp", "")) %>%
    select(
      contains("pidp"), # Participant ID
      ends_with("age_dv"), # Participant age
      ends_with("socweb"), # Do you have social media profile?
      ends_with("netcht"), # Hours using social media on weekdays
      ends_with("tvvidhrs"), # TV on weekdays (all waves)
      contains("sdq"), # Strengths and difficulties
      -ends_with("_orig"),
      -ends_with("sdqes_dv"),
      -ends_with("sdqcp_dv"),
      -ends_with("sdqha_dv"),
      -ends_with("sdqpp_dv"),
      -ends_with("sdqps_dv"),
      -ends_with("sdqtd_dv")
    )
  is.na(dataset[, ]) <- dataset[, ] < 0 # define NAs
  # Remove dataset indicator letter from variable names so that datasets can be merged
  # NOTE: the pattern is not anchored; it removes the first lowercase letter
  # followed by an underscore wherever it occurs in the name.
  names(dataset) <- str_remove(names(dataset), "[a-z]_")
  # Get rid of number labels
  dataset <- zap_labels(dataset)
  dataset
}
# Load the adolescent data from the folders downloaded from the UK data service.
files_adolescent <- list.files("data-raw/us/", pattern = "_youth.sav", recursive = TRUE, full.names = TRUE)
# Keep only paths that contain "ukhls"
files_adolescent <- files_adolescent[str_detect(files_adolescent, "ukhls")]
# One cleaned dataset per file
data <- map(files_adolescent, read_adolescent_data)
# Combine adolescent data from different waves (files) into long format
# Removes SPSS labels from variables & values
# The list is unnamed, so "wave" holds each file's position in files_adolescent
data <- bind_rows(data, .id = "wave")
# Exclude any teenagers from the adolescent data that are under 10 or over 15
data <- data %>%
  filter(between(age_dv, 10, 15)) %>%
  select(-age_dv)
# Get sex from "stable characteristics" file
# Understanding Society has a specific file, "ukhls_wx/xwavedat.sav", which
# indicates stable characteristics of participants in the survey. We prefer to
# use this for sex because it is derived from multiple interviews and checked
# for consistency.
data_stable <- list.files(
  "data-raw/us/",
  pattern = "xwavedat.sav", recursive = TRUE, full.names = TRUE
) %>%
  read_spss(col_select = c("pidp", "sex_dv"))
is.na(data_stable[, ]) <- data_stable[, ] < 0 # define NAs as anything under 0
# Convert the labelled sex variable to a factor and drop unused levels
data_stable$sex_dv <- as_factor(data_stable$sex_dv) %>% fct_drop()
data_stable <- zap_labels(data_stable)
# Attach sex to every adolescent row by participant ID
data <- left_join(data, data_stable, by = "pidp")
data <- rename(data, Sex = sex_dv)
Sex: Take out (10) inconsistent responses
# Proportion of rows in each Sex category, including missing
prop.table(table(data$Sex, useNA = "always"))
# filter() also drops rows where the comparison is NA, i.e. where Sex is missing
data <- filter(data, Sex != "inconsistent")
data <- data %>%
  mutate(Sex = factor(Sex, levels = c("Female", "Male")))
table(data$Sex, useNA = "always")
# Contrast code sex
contrasts(data$Sex) <- matrix(c(.5, -.5))
# Rename variables
data <- data %>%
  rename(
    socialmedia = socweb,
    socialmediaamount = netcht,
    TV = tvvidhrs
  )
We first clean and recode the social media use variable. This variable is complicated as there was a mistake on the side of Understanding Society, who were inconsistent in how they directed children and adults through the study in different years. Sometimes those who said they don't use social media on “netcht” were automatically coded as none on “socweb” and sometimes this wasn't the case. There is no easy solution for this, so we went with a recoding solution detailed below. If a participant said they do not own a social media account or they don't use social media to interact with friends we coded them as 1; for the rest of the participants we took their netcht score. This creates the following scale, which is ordinal in nature: 1 = None, 2 = Less than an hour, 3 = 1-3 hours, 4 = 4-6 hours, 5 = 7 or more hours.
# set 9 to NA to revert coding error
is.na(data[, c("socialmediaamount", "socialmedia")]) <- data[, c("socialmediaamount", "socialmedia")] == 9
# Code as 1 when socialmedia is 2 or socialmediaamount is 1;
# otherwise carry over socialmediaamount values 2-5. Anything else becomes NA.
# ifelse() returns NA wherever its test is NA, so a missing socialmedia
# combined with a socialmediaamount other than 1 yields NA.
data$socialmedia <- ifelse((data$socialmedia == 2 | data$socialmediaamount == 1), 1,
  ifelse(data$socialmediaamount == 2, 2,
    ifelse(data$socialmediaamount == 3, 3,
      ifelse(data$socialmediaamount == 4, 4,
        ifelse(data$socialmediaamount == 5, 5, NA)))))
data <- data %>% select(-socialmediaamount)
data <- rename(data, SM = socialmedia)
3.1 Year
# Convert wave to year
# Waves 1-9 to years 2009 ->
data <- rename(data, Year = "wave")
data$Year <- as.numeric(data$Year) + 2008
3.2 Strengths and difficulties
Our main outcome. It is only recorded every second year so we take out the years where it was not recorded.
data <- data %>%
  # SDQ only measured in these waves
  filter(Year %in% c(2009, 2011, 2013, 2015, 2017))
Recode: Greater values indicate greater difficulties
# Reverse-score the listed SDQ items by replacing each value x with 4 - x
data <- data %>%
  mutate_at(
    paste0("sdq", c("a", "d", "g", "i", "k", "n", "q", "t", "u", "y")),
    function(x) 4 - x
  )
SDQ subscales: http://sdqinfo.org/c9.html
# Item names that make up each of the two subscales
sdq_emo <- c("sdqc", "sdqh", "sdqm", "sdqp", "sdqx")
sdq_con <- c("sdqe", "sdqg", "sdql", "sdqr", "sdqv")
# Compute means of two subscales
# na.rm = TRUE averages over the items that are present in each row
data <- data %>%
  mutate(
    Emotion = rowMeans(select(., all_of(sdq_emo)), na.rm = TRUE),
    Conduct = rowMeans(select(., all_of(sdq_con)), na.rm = TRUE)
  )
# Remove all but these two subscales from SDQ variables
sdq <- select(data, starts_with("sdq"))
data <- select(data, !starts_with("sdq"))
data <- bind_cols(
  data,
  sdq %>% select(all_of(sdq_emo), all_of(sdq_con))
)
rm(sdq)
3.3 Save file
data <- data %>%
  # Drop all rows where sex or year is missing
  drop_na(Sex, Year)
# Save file to disk
saveRDS(data, "data/us.rds")
# Set the console output width to 120 characters
# (presumably so the session-info table below is not wrapped)
options(width = 120)
# Print the R session settings and loaded package versions
library(sessioninfo)
session_info()
## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
## setting value
## version R version 4.0.3 (2020-10-10)
## os macOS Big Sur 10.16
## system x86_64, darwin17.0
## ui X11
## language (EN)
## collate en_GB.UTF-8
## ctype en_GB.UTF-8
## tz Europe/London
## date 2021-03-01
##
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
## bookdown 0.21.6 2021-03-01 [1] Github (rstudio/bookdown@ca0145f)
## bslib 0.2.4 2021-01-25 [1] CRAN (R 4.0.3)
## cli 2.3.1 2021-02-23 [1] CRAN (R 4.0.3)
## digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.2)
## evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
## glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
## htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.2)
## jquerylib 0.1.3 2020-12-17 [1] CRAN (R 4.0.2)
## jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.2)
## knitr 1.31 2021-01-27 [1] CRAN (R 4.0.2)
## magrittr 2.0.1 2020-11-17 [1] CRAN (R 4.0.2)
## R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.2)
## rlang 0.4.10 2020-12-30 [1] CRAN (R 4.0.2)
## rmarkdown 2.7.2 2021-03-01 [1] Github (rstudio/rmarkdown@9bfaf4a)
## sass 0.3.1 2021-01-24 [1] CRAN (R 4.0.2)
## sessioninfo * 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
## stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
## stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
## withr 2.4.1 2021-01-26 [1] CRAN (R 4.0.2)
## xfun 0.21 2021-02-10 [1] CRAN (R 4.0.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
##
## [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library