MB-CDI
Purpose
This page documents the cleaning and merging procedures related to the MB-CDI data.
The home visit workflow separates these files into their own set of CSVs under `data/csv/home_visit/mbcdi`.
The aggregate (across language group and age) data files are saved under `data/csv/home_visit/agg`.
Preparation
source(file.path(here::here(), "R", "_OLD", "functions.R"))
purrr::walk(list.files(file.path(here::here(), "R"), "\\.R$", full.names = TRUE), source)
Let’s investigate the number of files, records, and variables per file.
# Full paths of every MB-CDI CSV in the home visit export directory.
mbcdi_fns <- list.files(
  path = file.path(here::here(), "data", "csv", "home_visit", "mbcdi"),
  pattern = "\\.csv$",
  full.names = TRUE
)
# How many files were found?
length(mbcdi_fns)
## [1] 25
#' Summarise one MB-CDI CSV file.
#'
#' Reads the file with every column typed as character and reports its
#' dimensions alongside the age group and language condition that the
#' helpers `extract_age_group_from_name()` and `form_language()` (defined
#' outside this page) derive from the file name.
#'
#' @param csv Path to a readable CSV file (a single string).
#' @return A one-row data frame with columns `fn`, `age_group`, `lang_cond`,
#'   `n_subs` (row count) and `n_vars` (column count).
make_datafile_summary <- function(csv) {
  # assert_that() converts a FALSE predicate into an error; calling the
  # predicates bare would silently discard a FALSE result.
  assertthat::assert_that(
    assertthat::is.string(csv),
    assertthat::is.readable(csv)
  )

  df <- readr::read_csv(
    csv,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )

  data.frame(
    fn = csv,
    age_group = extract_age_group_from_name(csv),
    lang_cond = form_language(csv),
    n_subs = nrow(df),
    n_vars = ncol(df)
  )
}
# Summarise each file, then stack the one-row summaries into one table.
mbcdi_file_dat <- purrr::list_rbind(
  purrr::map(mbcdi_fns, make_datafile_summary)
)

# Display the table ordered by age group, language condition and row count.
kableExtra::kable_classic(
  knitr::kable(
    dplyr::arrange(mbcdi_file_dat, age_group, lang_cond, n_subs),
    format = "html"
  )
)
fn | age_group | lang_cond | n_subs | n_vars |
---|---|---|---|---|
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/334099_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 1 | 730 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411456_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 1 | 717 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740623_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 33 | 712 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411469_mbcdi_12_bilingual_spanish.csv | 12 | bilingual_spanish | 1 | 717 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740624_mbcdi_12_bilingual_spanish.csv | 12 | bilingual_spanish | 1 | 712 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331848_mbcdi_12_english.csv | 12 | english | 4 | 256 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363431_mbcdi_12_english.csv | 12 | english | 10 | 255 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740625_mbcdi_12_english.csv | 12 | english | 172 | 259 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363466_mbcdi_18_bilingual_english.csv | 18 | bilingual_english | 0 | 815 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740626_mbcdi_18_bilingual_english.csv | 18 | bilingual_english | 47 | 829 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411388_mbcdi_18_bilingual_spanish.csv | 18 | bilingual_spanish | 0 | 815 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740627_mbcdi_18_bilingual_spanish.csv | 18 | bilingual_spanish | 5 | 829 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1136694_mbcdi_18_english.csv | 18 | english | 0 | 359 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1151489_mbcdi_18_english.csv | 18 | english | 0 | 359 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/307736_mbcdi_18_english.csv | 18 | english | 4 | 352 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363349_mbcdi_18_english.csv | 18 | english | 9 | 352 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740628_mbcdi_18_english.csv | 18 | english | 157 | 358 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363465_mbcdi_24_bilingual_english.csv | 24 | bilingual_english | 0 | 815 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_english.csv | 24 | bilingual_english | 36 | 829 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/408149_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 1 | 815 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740630_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 2 | 829 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 26 | 830 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331453_mbcdi_24_english.csv | 24 | english | 3 | 352 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363381_mbcdi_24_english.csv | 24 | english | 8 | 352 |
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740629_mbcdi_24_english.csv | 24 | english | 133 | 358 |
12-mo-old English speakers
# Non-empty 12-month English files.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "12" & lang_cond == "english" & n_subs > 0
)

# Clean each file with mcdi_clean_12_csv() (defined outside this page) and
# stack the results.
eng_12_combined_df <- eng_12_files$fn |>
  purrr::map(mcdi_clean_12_csv) |>
  purrr::list_rbind()

# Save the combined table.
eng_12_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_12_combined.csv"
)
readr::write_csv(eng_12_combined_df, eng_12_fn)
There are \(n=\) 186 participant records.
18-mo-old English speakers
This code should be wrapped in functions since many of the components duplicate one another.
# Non-empty 18-month English files.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "18" & lang_cond == "english" & n_subs > 0
)

# Clean each file with mcdi_clean_18_24_csv() (defined outside this page) and
# stack the results.
eng_18_combined_df <- eng_18_files$fn |>
  purrr::map(mcdi_clean_18_24_csv) |>
  purrr::list_rbind()

# Save the combined table.
eng_18_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_18_combined.csv"
)
readr::write_csv(eng_18_combined_df, eng_18_fn)
There are \(n=\) 170 participant records.
24-mo-old English speakers
# Non-empty 24-month English files.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "24" & lang_cond == "english" & n_subs > 0
)

# Clean each file with mcdi_clean_18_24_csv() (defined outside this page) and
# stack the results.
eng_24_combined_df <- eng_24_files$fn |>
  purrr::map(mcdi_clean_18_24_csv) |>
  purrr::list_rbind()

# Save the combined table.
eng_24_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_24_combined.csv"
)
readr::write_csv(eng_24_combined_df, eng_24_fn)
There are \(n=\) 144 participant records.
Old code
The following code is deprecated as of 2023-10-25, and is not run.
For simplicity, we’ll start with the youngest age group, and with the English speakers. There are \(n=3\) forms, with 4, 10, and 111 participants each, and 254, 253, and 257 variables.
# Keep only the 12-month English files that contain at least one record.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "12" & lang_cond == "english" & n_subs > 0
)
We’ll examine the first one.
# Read the first 12-month English file with every column as character.
eng_12_331 <- readr::read_csv(
  eng_12_files$fn[1],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)
# Peek at the first few raw column names.
head(names(eng_12_331))
Let’s try trimming the metadata labels.
That looks better. Let’s look at the second file.
# Read the second 12-month English file with every column as character.
eng_12_363 <- readr::read_csv(
  eng_12_files$fn[2],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)
# Peek at the first few column names with their path-like prefixes removed.
head(basename(names(eng_12_363)))
And the third one.
# Read the third 12-month English file with every column as character.
eng_12_740625 <- readr::read_csv(
  eng_12_files$fn[3],
  col_types = readr::cols(.default = "c"),
  show_col_types = FALSE
)
# Peek at the first few column names with their path-like prefixes removed.
head(basename(names(eng_12_740625)))
Now, we’ll create a function to clean the variable names.
#' Read a CSV and shorten its column names to their basenames.
#'
#' Every column is read as character. Each column name is then replaced by
#' `basename()` of itself, i.e. only the part after the last "/" is kept.
#'
#' @param csv_fn Path to a readable CSV file (a single string).
#' @return The data read from `csv_fn`, with shortened column names.
select_basename <- function(csv_fn) {
  # assert_that() converts a FALSE predicate into an error; calling the
  # predicates bare would silently discard a FALSE result.
  assertthat::assert_that(
    assertthat::is.string(csv_fn),
    assertthat::is.readable(csv_fn)
  )

  df <- readr::read_csv(
    csv_fn,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )
  names(df) <- basename(names(df))
  df
}
# Show the first rows of the third file after its names have been shortened.
head(select_basename(eng_12_files$fn[3]))
Let’s trim unneeded fields. We’ll write several helper functions to do this.
#' Drop non-item columns from a CDI data frame.
#'
#' Removes every column whose name contains any of the listed fragments.
#'
#' @param df A data frame of CDI responses.
#' @return `df` without the matching columns.
trim_cdi_fields <- function(df) {
  unwanted <- c("note", "instructions", "comments", "continue", "vocab", "mcdi")
  # contains() takes the union of matches when given several fragments.
  dplyr::select(df, -contains(unwanted))
}
#' Add a running row index column named `play_i`.
#'
#' The index numbers the rows of `df` from 1 to `nrow(df)`, so it identifies
#' participants only when `df` has one row per participant.
#'
#' @param df A data frame.
#' @return `df` with an extra integer column `play_i`.
add_particip_index <- function(df) {
  # Computed outside mutate() so a column that happens to be called `df`
  # cannot shadow the argument, and with seq_len() so a zero-row input gets
  # an empty index instead of the length-2 vector that `1:0` produces.
  row_index <- seq_len(nrow(df))
  dplyr::mutate(df, play_i = row_index)
}
#' Reshape a wide CDI data frame to one row per (first column, word) pair.
#'
#' Every column except the first is pivoted into `word` /
#' `understands_or_says`; rows with a missing response are dropped and a few
#' response and item labels are normalised.
#'
#' @param df A wide data frame. The first column is kept as the identifier
#'   (presumably the participant id; confirm against the caller).
#' @return A long data frame with the first column of `df`, `word`, and
#'   `understands_or_says`.
make_cdi_longer <- function(df) {
  df |>
    # `-1` selects every column but the first. Unlike `2:n_vars` it does not
    # depend on an external variable inside the selection (ambiguous if a
    # column shares that name) and cannot count backwards when `df` has a
    # single column.
    tidyr::pivot_longer(
      cols = -1,
      names_to = "word",
      values_to = "understands_or_says"
    ) |>
    dplyr::filter(!is.na(understands_or_says)) |>
    # Expressions inside one mutate() are evaluated in order, so the two
    # response replacements still run one after the other.
    dplyr::mutate(
      understands_or_says = stringr::str_replace(
        understands_or_says, "understands___", "says"
      ),
      understands_or_says = stringr::str_replace(
        understands_or_says, "understands_says", "says"
      ),
      word = stringr::str_replace(word, "mommy_001", "mommy"),
      word = stringr::str_replace(word, "bath_001", "bath")
    )
}
Then we combine them into an omnibus function.
#' Read and clean one CDI CSV file.
#'
#' Shortens the column names, drops non-item columns, renames
#' `participant_id` to `play_id`, pivots to long form, and finally adds the
#' `play_i` row index.
#'
#' @param csv_fn Path to a CDI CSV file.
#' @return A long data frame of responses.
clean_cdi <- function(csv_fn) {
  wide_df <- trim_cdi_fields(select_basename(csv_fn))
  wide_df <- dplyr::rename(wide_df, "play_id" = "participant_id")
  # The index is added after pivoting, so it numbers the long rows.
  add_particip_index(make_cdi_longer(wide_df))
}
Now, we can run `clean_cdi()` across all three files.
# Clean every 12-month English file and stack the results.
eng_12 <- eng_12_files$fn |>
  purrr::map(clean_cdi) |>
  purrr::list_rbind()
# Cross-tabulate words against the recorded response.
xtabs(~ word + understands_or_says, data = eng_12)
18-mo-old English speakers
Let’s move on to the 18-mo-old English speakers.
# Keep only the 18-month English files that contain at least one record.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "18" & lang_cond == "english" & n_subs > 0
)
There are \(n=\) 3 files with participant data.
# Try clean_cdi() on each of the three 18-month English files in turn.
eng_18_files$fn[1] |> clean_cdi()
eng_18_files$fn[2] |> clean_cdi()
eng_18_files$fn[3] |> clean_cdi()
There are some duplicate entries for some words.
We need a strategy for reconciling these duplicates: `candy`, `leg`, `rain`, `wet`.
It’s not elegant, but I have one for modifying the duplicate names. See below.
24-mo-old English speakers
Let’s move on to the 24-mo-old English speakers.
# Keep only the 24-month English files that contain at least one record.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == "24" & lang_cond == "english" & n_subs > 0
)
There are \(n=\) 3 files with participant data.
Let’s see how `clean_cdi()` works on one of these.
# Try clean_cdi() on the first 24-month English file.
eng_24_files$fn[1] |> clean_cdi()
Once again, we have duplicates for several items: ‘candy’, ‘leg’, ‘rain’, ‘wet’.
It’s very hacky, but I think we might want to modify these item names until we figure out a better way to handle the duplicates.
#' Make repeated column names unique by numbering them.
#'
#' Every column whose name equals `dupe` is renamed to `<dupe>_1`,
#' `<dupe>_2`, ... in column order. A single matching column is still renamed
#' to `<dupe>_1`; when nothing matches, `df` is returned unchanged.
#'
#' @param df A data frame (or any object with `names()`).
#' @param dupe The column name to number.
#' @return `df` with the matching columns renamed.
modify_mcdi_dupes <- function(df, dupe = 'leg') {
  # which() drops non-matches, and seq_along() yields an empty sequence when
  # there is no match, so no `1:0` loop over missing indices is needed.
  dup_index <- which(names(df) == dupe)
  names(df)[dup_index] <- paste0(dupe, "_", seq_along(dup_index))
  df
}
#' Read a CSV file with every column typed as character.
#'
#' @param csv_fn Path to a readable CSV file (a single string).
#' @return The data frame returned by `readr::read_csv()`.
open_csv <- function(csv_fn) {
  # assert_that() converts a FALSE predicate into an error; calling the
  # predicates bare would silently discard a FALSE result.
  assertthat::assert_that(
    assertthat::is.string(csv_fn),
    assertthat::is.readable(csv_fn)
  )

  # Return the result directly rather than ending on an assignment, which
  # would hand the value back invisibly.
  readr::read_csv(
    csv_fn,
    col_types = readr::cols(.default = "c"),
    show_col_types = FALSE
  )
}
#' Drop non-item columns from an 18/24-month CDI data frame.
#'
#' Removes every column whose name contains one of the listed fragments.
#'
#' @param df A data frame of CDI responses.
#' @return `df` without the matching columns.
trim_cdi_18_24_fields <- function(df) {
  dplyr::select(
    df,
    -contains(c("note", "instructions", "comments",
                "continue", "vocab", "mcdi"))
  )
}
#' Reshape a wide 18/24-month CDI data frame to long form.
#'
#' Every column except the first is pivoted into `word` / `knows`.
#'
#' @param df A wide data frame. The first column is kept as the identifier
#'   (presumably the participant id; confirm against the caller).
#' @return A long data frame with the first column of `df`, `word`, and
#'   `knows`.
make_cdi_18_24_longer <- function(df) {
  # `-1` selects every column but the first. Unlike `2:n_vars` it does not
  # depend on an external variable inside the selection (ambiguous if a
  # column shares that name) and cannot count backwards when `df` has a
  # single column.
  tidyr::pivot_longer(
    df,
    cols = -1,
    names_to = "word",
    values_to = "knows"
  )
}
#' Read and clean one 18/24-month CDI CSV, numbering duplicated item names.
#'
#' Shortens the column names to their basenames, numbers the repeated `leg`,
#' `candy`, `rain` and `wet` columns, drops non-item columns, renames
#' `participant_id` to `play_id`, and pivots to long form.
#'
#' @param csv_fn Path to a CDI CSV file.
#' @return A long data frame of responses.
clean_cdi_18_24_dedupe <- function(csv_fn) {
  raw_df <- open_csv(csv_fn)
  names(raw_df) <- basename(names(raw_df))

  # Number each known duplicated item, in the same order as before.
  for (item in c("leg", "candy", "rain", "wet")) {
    raw_df <- modify_mcdi_dupes(raw_df, dupe = item)
  }

  trimmed_df <- trim_cdi_18_24_fields(raw_df)
  trimmed_df <- dplyr::rename(trimmed_df, "play_id" = "participant_id")
  make_cdi_18_24_longer(trimmed_df)
}
# Clean every 24-month English file and stack the results.
eng_24 <- eng_24_files$fn |>
  purrr::map(clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()
# Cross-tabulate words against the recorded response.
xtabs(~ word + knows, data = eng_24)
Now, we can return to the 18-mo-old data to see if this works:
# Clean every 18-month English file and stack the results.
eng_18 <- eng_18_files$fn |>
  purrr::map(clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()
# Cross-tabulate words against the recorded response.
xtabs(~ word + knows, data = eng_18)
It does.