3 Process AC:NH data
Here, we process the AC:NH survey and telemetry files.
We used the following R packages:
library(pacman)
p_load(
readxl,
knitr,
here,
lubridate,
scales,
janitor,
tidyverse )
3.1 Raw data
This script expects the raw data files to be in data-raw/noa/
. The raw telemetry and survey files are in a zip compressed file on OSF: https://osf.io/cjd6z/. Below, we provide code that downloads and unpacks these files.
The raw survey data was edited to exclude test sessions, any data from individuals who did not consent / below 18, and some unnecessary variables, before uploading to OSF. The code to do that is shown below (but cannot be executed without our formr credentials)
# Dont download if already downloaded
if (!file.exists(here("data-raw/noa/formr-raw.rds"))) {
# Connect to formr
::formr_connect()
formr# These are the survey names on formr
<- c("gaming_welcome", paste0("gaming_block", 2:5))
surveys # Download data into a list of data frames
<- map(surveys, ~formr::formr_results(.) %>% as_tibble)
ac saveRDS(ac, here("data-raw/noa/formr-raw.rds"))
else {ac <- readRDS(here("data-raw/noa/formr-raw.rds"))}
}
# Take out all intermediate time variables
1]] <- select(ac[[1]], -c(modified:expired))
ac[[2:4] <- map(ac[2:4], ~select(., -c(created:expired)))
ac[5]] <- select(ac[[5]], -c(created, modified, expired))
ac[[
# Transform to one data frame with one row per participant
<- ac %>% reduce(left_join)
ac
# Limit data to survey window to exclude our test sessions
<- ac %>%
ac filter(created >= ymd("2020-10-27")) %>%
filter(created <= ymd("2020-10-27") + days(7))
range(ac$created)
# Keep only surveys whose code exists and consents check out
<- filter(
ac !is.na(code), of_age==1, consent_data==1, consent==1
ac,
)
# Take out unnecessary variables
<- ac %>%
ac select(
-c(session, browser, last_outside_referrer, of_age:consent)
)write_rds(ac, here("data-raw/noa/formr.rds"))
The file produced by the above code chunk was uploaded to OSF in a zip compressed archive with the telemetry file.
Start by downloading that file and unpacking it to the target directory by running the code below:
<- here("data-raw/noa/noa.zip")
file_dest # Download & extract file only if you haven't yet
if (!file.exists(file_dest)) {
download.file("https://osf.io/fev95/download", file_dest)
}if (!file.exists(here("data-raw/noa/formr.rds"))) {
unzip(file_dest, exdir = here("data-raw/noa/"))
}
Next, we load the survey data file.
<- readRDS(here("data-raw/noa/formr.rds")) ac
3.2 Process raw files
3.2.1 Clean survey data
Here, we clean the survey data. Specifically, we
- Harmonize names so that they are the same as in the other data set (i.e., from PvZ)
- Create duration variable for game time
- Give some sensible variable names
- Assign proper variable types
# Harmonize some names to PvZ names
<- ac %>%
ac rename(
gender = sex,
gender_other = sex_other,
player_id = code
)
# Duration of survey
<- ac %>%
ac mutate(
survey_duration = ended-created
)# Create variables for straightliners by checking if variance within a block of questions is zero
$straightliner_spane <- apply(
acselect(ac, starts_with("spane_") & !starts_with("spane_acnh")),
1, sd, na.rm = TRUE
)$straightliner_spane <- ac$straightliner_spane==0
ac$straightliner_motivations <- apply(
acselect(
ac, starts_with("autonomy_"),
starts_with("competence_"),
starts_with("related_"),
starts_with("enjoymen_"),
starts_with("extrinsic_")
), 1, sd, na.rm = TRUE
)$straightliner_motivations <- ac$straightliner_motivations==0
ac
# These are needed as factors
<- ac %>%
ac mutate(across(c(gender, played), as_factor))
# Reverse scored items
<- ac %>%
ac mutate(
across(
c(
related_not_close,
enjoyment_attention,
enjoymen_boring
),~ 8 - .x
) )
Next, let’s create mean indices for the scales like we did before. SPANE has positive affect, negative affect, and an affect balance score (subtract negative from positive).
# Need to rename SPANE item so it doesnt become confused with scale score name
<- rename(
ac
ac, spane_positiveItem = spane_positive,
spane_negativeItem = spane_negative
)
# General SPANE
<- ac %>%
ac mutate(
spane_positive = rowMeans(
select(
.,
spane_positiveItem,
spane_good,
spane_pleasant,
spane_happy,
spane_joyful,
spane_contented
),na.rm = TRUE
),spane_negative = rowMeans(
select(
.,
spane_negativeItem,
spane_bad,
spane_unpleasant,
spane_sad,
spane_afraid,
spane_angry
),na.rm = TRUE
),spane_balance = spane_positive - spane_negative
)
# Motivations
<- ac %>%
ac mutate(
autonomy = rowMeans(
select(., starts_with("autonomy")), na.rm = TRUE
),competence = rowMeans(
select(., starts_with("competence")), na.rm = TRUE
),relatedness = rowMeans(
select(., starts_with("related")), na.rm = TRUE
),enjoyment = rowMeans(
select(., starts_with("enjoymen")), na.rm = TRUE
),extrinsic = rowMeans(
select(., starts_with("extrinsic")), na.rm = TRUE
)
)
# SPANE because of playing AC:NH
<- ac %>%
ac mutate(
spane_game_positive = rowMeans(
select(
.,
spane_acnh_positive,
spane_acnh_good,
spane_acnh_pleasant,
spane_acnh_happy,
spane_acnh_joyful,
spane_acnh_contented
),na.rm = TRUE
),spane_game_negative = rowMeans(
select(
.,
spane_acnh_negative,
spane_acnh_bad,
spane_acnh_unpleasant,
spane_acnh_sad,
spane_acnh_afraid,
spane_acnh_angry
),na.rm = TRUE
),spane_game_balance = spane_game_positive - spane_game_negative
)
# Hours of estimated play
<- ac %>%
ac mutate(active_play_minutes = active_play_minutes / 60) %>%
mutate(active_play = rowSums(select(., starts_with("active_play")), na.rm = T))
3.2.2 Checking
First, we check how many rows per player there are.
count(ac, player_id, sort = T)
## # A tibble: 5,987 x 2
## player_id n
## <chr> <int>
## 1 "" 13
## 2 "7dadc6e88bc0b5b8" 3
## 3 "33ec57527b06af7b" 2
## 4 "48bb7bafae2e5c44" 2
## 5 "49f5299432fef738" 2
## 6 "5b9d217b3cbb39fa" 2
## 7 "677ca59a6ed1e5a8" 2
## 8 "7092634f86aa0992" 2
## 9 "842b41d5e20f2922" 2
## 10 "8bd8d7292b3a05b5" 2
## # … with 5,977 more rows
There are two kinds of problems:
- No ID was captured
- An ID was used more than once
For both cases, connecting to telemetry would be impossible (and wrong connections could be made in latter case), so we drop these cases.
<- add_count(ac, player_id) %>%
ac filter(n == 1) %>%
select(-n)
3.2.3 Telemetry
This file is in the ZIP archive.
<- read_tsv(here("data-raw/noa/telem_data (since Sep 2020).txt")) gt
Column definitions:
lc_recorded_at
= Session start date/timenc_recorded_at
= Session end date/timehashed_id
= Hashed account IDproduct_model
= Switch model game was played onoperation_mode
= Identifies handheld mode, TV modeduration
= Duration of session (seconds)storage_id
= Whether game is played off game card, SD card or internal system memoryapplication_id_hex
= Game’s hashed ID
We drop some unnecessary variables
<- select(
gt
gt,
hashed_id, contains("recorded"),
duration )
Then rename
names(gt) <- c(
"player_id", "session_start", "session_end", "Hours"
)
And turn duration into hours
$Hours <- gt$Hours/60/60 gt
Assume that timestamps are US Pacific as this was used to report data collection dates & times.
<- gt %>%
gt mutate(
across(contains("session"), ~mdy_hm(.x, tz = "US/Pacific"))
)glimpse(gt)
## Rows: 191,498
## Columns: 4
## $ player_id <chr> "be080cb5754884b3", "35a6c680bcf65615", "35a6c680bcf656…
## $ session_start <dttm> 2020-10-15 22:39:00, 2020-09-09 03:38:00, 2020-10-03 1…
## $ session_end <dttm> 2020-10-15 22:39:00, 2020-09-09 03:38:00, 2020-10-03 1…
## $ Hours <dbl> 0.67583333, 0.19111111, 0.67111111, 1.77750000, 1.76000…
3.2.4 Clean
We don’t need to limit to IDs who took the survey as NOA has already done that–these data only contain folks who filled in the survey.
We do need to limit the data to two weeks preceding the survey, and count session durations within that window. We therefore need to use the session start/end times to find out when the sessions happened.
Some processing is required to do that as the times can have noise due to e.g. players’ system times being incorrectly set. Thus, many start times are the same (or even later) than the end time:
%>%
gt mutate(
later_or_same_start = session_start >= session_end
%>%
) tabyl(later_or_same_start) %>%
adorn_pct_formatting()
## later_or_same_start n percent
## FALSE 13112 6.8%
## TRUE 178386 93.2%
However, session durations are not based on the device time (session times):
%>%
gt mutate(
duration = as.numeric(session_end - session_start)/60/60
%>%
) mutate(match = duration/60/60==Hours) %>%
tabyl(match) %>%
adorn_pct_formatting()
## match n percent
## FALSE 191494 100.0%
## TRUE 4 0.0%
And end and start times should be within a window.
<- gt %>%
tmp filter(
< ymd("2020-09-01") |
session_start < ymd("2020-09-01") |
session_end > ymd("2020-11-03") |
session_start > ymd("2020-11-03")
session_end %>%
) arrange(session_start) %>%
mutate(player_id = fct_inorder(player_id))
# Proportion of these bad dates
percent(nrow(tmp) / nrow(gt), .1)
## [1] "3.1%"
We filter out sessions that aren’t in the two weeks preceding each player’s survey. Note not all survey respondents have telemetry so the resulting table will be longer
# Get survey times from survey table
<- select(ac, player_id, created) %>%
gt left_join(gt)
# Limit telemetry sessions to appropriate time window
<- gt %>%
gt filter(session_start >= (created - days(14))) %>%
filter(session_start < created) %>%
filter(session_end < created)
We then summarize to total hours per person.
<- gt %>%
gt group_by(player_id) %>%
summarise(
Hours = sum(Hours),
n_sessions = n()
)
3.2.5 Join survey and telemetry
<- left_join(ac, gt) ac
3.2.6 Checking
People reported if they played AC:NH in the past 14 days. Lets summarise the players, number of players with telemetry, and mean hours, for these two groups
%>%
ac group_by(played) %>%
summarise(
Players = n(),
Missing_Hours = sum(is.na(Hours)),
Mean_Hours = mean(Hours, na.rm = TRUE)
)
## # A tibble: 3 x 4
## played Players Missing_Hours Mean_Hours
## <fct> <int> <int> <dbl>
## 1 I have played Animal Crossing: New Horizons … 4602 2017 11.7
## 2 I have **NOT** played Animal Crossing: New H… 1233 1114 2.09
## 3 <NA> 140 88 6.75
3.3 Exclusions
First save a file with no exclusions.
write_rds(ac, here("data/noa/ac.rds"))
3.3.1 Straightliners
We take out all individuals who straightlined (gave the same response to every item) through SPANE and motivations scales. (If only SPANE items existed, then we didn’t exclude.)
<- ac %>%
ac mutate(
straightliner =
& straightliner_motivations
straightliner_spane
) %>%
ac select(contains("straight")) %>%
group_by_all() %>%
count
## # A tibble: 7 x 4
## # Groups: straightliner_spane, straightliner_motivations, straightliner [7]
## straightliner_spane straightliner_motivations straightliner n
## <lgl> <lgl> <lgl> <int>
## 1 FALSE FALSE FALSE 4266
## 2 FALSE TRUE FALSE 7
## 3 FALSE NA FALSE 1215
## 4 TRUE FALSE FALSE 20
## 5 TRUE TRUE TRUE 8
## 6 TRUE NA NA 19
## 7 NA NA NA 440
%>%
ac tabyl(straightliner) %>%
adorn_pct_formatting()
## straightliner n percent valid_percent
## FALSE 5508 92.2% 99.9%
## TRUE 8 0.1% 0.1%
## NA 459 7.7% -
# filter() would also exclude NAs
<- filter(ac, !straightliner | is.na(straightliner)) ac
3.3.2 Outliers
Potential outliers. We replace all values that are more than 6SD away from the variable’s mean with NAs. As a consequence, individuals are excluded on an analysis-by-analysis case (so if has bad data relevant to that analysis or figure).
This is only done for a subset of variables (relavant to analyses; see below)
<- ac %>%
ac # These variables will be affected
pivot_longer(
c(
:Hours,
spane_positiveItem-played_with_others, -ended, -survey_duration,
-contains("straightliner")
)%>%
) group_by(name) %>%
mutate(z_value = as.numeric(scale(value)))
These are the numbers of people taken out of each variable (only variables that were affected are shown):
# This is what are taken out
%>%
ac summarise(
Extremes = sum(abs(z_value>=6), na.rm = TRUE),
Extremes_p = percent(Extremes/n(), accuracy = .01)
%>%
) filter(Extremes > 0)
## # A tibble: 5 x 3
## name Extremes Extremes_p
## <chr> <int> <chr>
## 1 active_play 40 0.67%
## 2 active_play_hours 37 0.62%
## 3 Hours 12 0.20%
## 4 spane_acnh_afraid 25 0.42%
## 5 spane_game_negative 4 0.07%
Code to do it:
<- ac %>%
ac mutate(value = ifelse(abs(z_value >= 6), NA, value)) %>%
select(-z_value) %>%
pivot_wider(names_from = "name", values_from = "value") %>%
ungroup()
3.4 Save files
write_rds(ac, here("data/noa/ac-excluded.rds"))
3.5 Session info
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [5] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 ggplot2_3.3.2
## [9] tidyverse_1.3.0 janitor_2.0.1 scales_1.1.1 lubridate_1.7.9.2
## [13] here_1.0.1 knitr_1.30 readxl_1.3.1 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 xfun_0.19 haven_2.3.1 snakecase_0.11.0
## [5] colorspace_2.0-0 vctrs_0.3.5 generics_0.1.0 htmltools_0.5.0
## [9] yaml_2.2.1 utf8_1.1.4 rlang_0.4.9 pillar_1.4.7
## [13] withr_2.3.0 glue_1.4.2 DBI_1.1.0 dbplyr_2.0.0
## [17] modelr_0.1.8 lifecycle_0.2.0 munsell_0.5.0 gtable_0.3.0
## [21] cellranger_1.1.0 rvest_0.3.6 evaluate_0.14 parallel_4.0.3
## [25] fansi_0.4.1 broom_0.7.2 Rcpp_1.0.5 backports_1.2.1
## [29] jsonlite_1.7.2 fs_1.5.0 hms_0.5.3 digest_0.6.27
## [33] stringi_1.5.3 bookdown_0.21 rprojroot_2.0.2 grid_4.0.3
## [37] cli_2.2.0 tools_4.0.3 magrittr_2.0.1 crayon_1.3.4
## [41] pkgconfig_2.0.3 ellipsis_0.3.1 xml2_1.3.2 reprex_0.3.0
## [45] assertthat_0.2.1 rmarkdown_2.6 httr_1.4.2 rstudioapi_0.13
## [49] R6_2.5.0 compiler_4.0.3