MB-CDI
Purpose
This page documents the cleaning and merging procedures related to the MB-CDI data.
The home visit workflow strips these files into their own set of CSVs under data/csv/home_visit/mbcdi.
The aggregate (across language group and age) data files are saved under data/csv/home_visit/agg.
Preparation
source(file.path(here::here(), "R", "_OLD", "functions.R"))
purrr::walk(list.files(file.path(here::here(), "R"), "\\.R$", full.names = TRUE), source)
Let’s investigate the number of files, records, and variables per file.
mbcdi_fns <-
  list.files(file.path(here::here(), "data", "csv", "home_visit", "mbcdi"), "\\.csv$", full.names = TRUE)
length(mbcdi_fns)
## [1] 25
#' Summarise one MB-CDI CSV file
#'
#' Reads the file with every column as character and reports its dimensions
#' together with the age group and language condition derived from its path.
#'
#' @param csv Path to a readable CSV file (a single string).
#' @return A one-row data frame with columns `fn` (the path), `age_group`
#'   (whatever `extract_age_group_from_name(csv)` returns), `lang_cond`
#'   (whatever `form_language(csv)` returns), `n_subs` (row count) and
#'   `n_vars` (column count).
make_datafile_summary <- function(csv) {
  # is.string()/is.readable() only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an error.
  assertthat::assert_that(assertthat::is.string(csv))
  assertthat::assert_that(assertthat::is.readable(csv))

  df <-
    readr::read_csv(csv,
                    col_types = readr::cols(.default = "c"),
                    show_col_types = FALSE)

  data.frame(fn = csv,
             age_group = extract_age_group_from_name(csv),
             lang_cond = form_language(csv),
             n_subs = nrow(df),
             n_vars = ncol(df))
}
mbcdi_file_dat <-
  purrr::map(mbcdi_fns, make_datafile_summary) |> purrr::list_rbind()
mbcdi_file_dat |>
  dplyr::arrange(age_group, lang_cond, n_subs) |>
  knitr::kable(format = 'html') |>
  kableExtra::kable_classic()
| fn | age_group | lang_cond | n_subs | n_vars |
|---|---|---|---|---|
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/334099_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 1 | 730 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411456_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 1 | 717 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740623_mbcdi_12_bilingual_english.csv | 12 | bilingual_english | 33 | 712 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411469_mbcdi_12_bilingual_spanish.csv | 12 | bilingual_spanish | 1 | 717 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740624_mbcdi_12_bilingual_spanish.csv | 12 | bilingual_spanish | 1 | 712 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331848_mbcdi_12_english.csv | 12 | english | 4 | 256 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363431_mbcdi_12_english.csv | 12 | english | 10 | 255 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740625_mbcdi_12_english.csv | 12 | english | 172 | 259 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363466_mbcdi_18_bilingual_english.csv | 18 | bilingual_english | 0 | 815 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740626_mbcdi_18_bilingual_english.csv | 18 | bilingual_english | 47 | 829 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411388_mbcdi_18_bilingual_spanish.csv | 18 | bilingual_spanish | 0 | 815 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740627_mbcdi_18_bilingual_spanish.csv | 18 | bilingual_spanish | 5 | 829 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1136694_mbcdi_18_english.csv | 18 | english | 0 | 359 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1151489_mbcdi_18_english.csv | 18 | english | 0 | 359 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/307736_mbcdi_18_english.csv | 18 | english | 4 | 352 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363349_mbcdi_18_english.csv | 18 | english | 9 | 352 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740628_mbcdi_18_english.csv | 18 | english | 157 | 358 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363465_mbcdi_24_bilingual_english.csv | 24 | bilingual_english | 0 | 815 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_english.csv | 24 | bilingual_english | 36 | 829 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/408149_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 1 | 815 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740630_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 2 | 829 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_spanish.csv | 24 | bilingual_spanish | 26 | 830 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331453_mbcdi_24_english.csv | 24 | english | 3 | 352 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363381_mbcdi_24_english.csv | 24 | english | 8 | 352 | 
| /Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740629_mbcdi_24_english.csv | 24 | english | 133 | 358 | 
12-mo-old English speakers
eng_12_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '12', lang_cond == 'english', n_subs > 0)
eng_12_combined_df <- purrr::map(eng_12_files$fn, mcdi_clean_12_csv) |>
  purrr::list_rbind()
eng_12_fn <- file.path(here::here(), "data", "csv", "home_visit", "agg", "mcdi_english_12_combined.csv")
readr::write_csv(eng_12_combined_df, eng_12_fn)
There are \(n=\) 186 participant records.
18-mo-old English speakers
This code should be wrapped in functions since many of the components duplicate one another.
eng_18_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '18', lang_cond == 'english', n_subs > 0)
eng_18_combined_df <- purrr::map(eng_18_files$fn, mcdi_clean_18_24_csv) |>
  purrr::list_rbind()
eng_18_fn <- file.path(here::here(), "data", "csv", "home_visit", "agg", "mcdi_english_18_combined.csv")
readr::write_csv(eng_18_combined_df, eng_18_fn)
There are \(n=\) 170 participant records.
24-mo-old English speakers
eng_24_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '24', lang_cond == 'english', n_subs > 0)
eng_24_combined_df <- purrr::map(eng_24_files$fn, mcdi_clean_18_24_csv) |>
  purrr::list_rbind()
eng_24_fn <- file.path(here::here(), "data", "csv", "home_visit", "agg", "mcdi_english_24_combined.csv")
readr::write_csv(eng_24_combined_df, eng_24_fn)
There are \(n=\) 144 participant records.
Old code
The following code is deprecated as of 2023-10-25, and is not run.
For simplicity, we’ll start with the youngest age group, and with the English speakers. There are \(n=3\) forms, with 4, 10, and 111 participants each, and 254, 253, and 257 variables.
eng_12_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '12', lang_cond == 'english', n_subs > 0)
We’ll examine the first one.
eng_12_331 <- readr::read_csv(eng_12_files$fn[1],
                    col_types = readr::cols(.default = 'c'),
                    show_col_types = FALSE)
names(eng_12_331) |> head()
Let’s try trimming the metadata labels.
That looks better. Let’s look at the second file.
eng_12_363 <- readr::read_csv(eng_12_files$fn[2],
                    col_types = readr::cols(.default = 'c'),
                    show_col_types = FALSE)
names(eng_12_363) |> basename() |> head()
And the third one.
eng_12_740625 <- readr::read_csv(eng_12_files$fn[3],
                    col_types = readr::cols(.default = 'c'),
                    show_col_types = FALSE)
names(eng_12_740625) |> basename() |> head()
Now, we’ll create a function to clean the variable names.
#' Read a CSV and shorten its column names to their basenames
#'
#' @param csv_fn Path to a readable CSV file (a single string).
#' @return The data read by `readr::read_csv()` with every column as
#'   character and each column name replaced by `basename()` of the original.
select_basename <- function(csv_fn) {
  # is.string()/is.readable() only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an error.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  df <- readr::read_csv(csv_fn,
                        col_types = readr::cols(.default = "c"),
                        show_col_types = FALSE)

  # basename() keeps only the text after the last "/" in each column name.
  names(df) <- basename(names(df))
  df
}
select_basename(eng_12_files$fn[3]) |> head()
Let’s trim unneeded fields. We’ll write several helper functions to do this.
# Drop every column whose name contains one of the listed substrings.
#
# df: a data frame.
# Returns `df` without the matching columns; the remaining columns keep
# their original order.
trim_cdi_fields <- function(df) {
  dplyr::select(
    df,
    # A character vector passed to contains() selects the union of matches,
    # so a single negated call removes all of them at once.
    -dplyr::contains(
      c("note", "instructions", "comments", "continue", "vocab", "mcdi")
    )
  )
}
#' Add a running row index
#'
#' @param df A data frame.
#' @return `df` with an extra integer column `play_i` numbering its rows from
#'   1 to `nrow(df)`.
add_particip_index <- function(df) {
  # seq_len() gives an empty index for zero-row input, where 1:0 would yield
  # c(1, 0) and make mutate() fail on the length mismatch.
  dplyr::mutate(df, play_i = seq_len(nrow(df)))
}
make_cdi_longer <- function(df) {
  n_vars <- dim(df)[2]
  df |>
    tidyr::pivot_longer(cols = 2:n_vars,
                        names_to = "word",
                        values_to = "understands_or_says") |>
    dplyr::filter(!is.na(understands_or_says)) |>
    dplyr::mutate(understands_or_says = stringr::str_replace(understands_or_says, "understands___", "says")) |>
    dplyr::mutate(
      understands_or_says = stringr::str_replace(understands_or_says, "understands_says", "says")) |>
    dplyr::mutate(word = stringr::str_replace(word, "mommy_001", "mommy")) |>
    dplyr::mutate(word = stringr::str_replace(word, "bath_001", "bath"))
}
Then we combine them into an omnibus function.
clean_cdi <- function(csv_fn) {
  select_basename(csv_fn) |>
    trim_cdi_fields() |>
    dplyr::rename("play_id" = "participant_id") |>
    make_cdi_longer() |>
    add_particip_index()
}
Now, we can run clean_cdi() across all three files.
eng_12 <- purrr::map(eng_12_files$fn, clean_cdi) |>
  purrr::list_rbind()
xtabs(~ word + understands_or_says, eng_12)
18-mo-old English speakers
Let’s move on to the 18-mo-old English speakers.
eng_18_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '18', lang_cond == 'english', n_subs > 0)
There are \(n=\) 3 files with participant data.
clean_cdi(eng_18_files$fn[1])
clean_cdi(eng_18_files$fn[2])
clean_cdi(eng_18_files$fn[3])
There are some duplicate entries for some words.
We need a strategy for reconciling these duplicates: candy, leg, rain, wet.
It’s not elegant, but I have one for modifying the duplicate names. See below.
24-mo-old English speakers
Let’s move on to the 24-mo-old English speakers.
eng_24_files <- mbcdi_file_dat |>
  dplyr::filter(age_group == '24', lang_cond == 'english', n_subs > 0)
There are \(n=\) 3 files with participant data.
Let’s see how the clean_cdi() works on one of these.
clean_cdi(eng_24_files$fn[1])
Once again, we have duplicates for several items: ‘candy’, ‘leg’, ‘rain’, ‘wet’.
It’s very hacky, but I think we might want to modify these item names until we figure out a better way to handle the duplicates.
#' Make repeated column names unique by numbering them
#'
#' Every column whose name equals `dupe` is renamed to `<dupe>_1`, `<dupe>_2`,
#' ... in column order. A name that occurs only once is still renamed to
#' `<dupe>_1`. If no column matches, `df` is returned unchanged.
#'
#' @param df A data frame (or other named object supporting `names<-`).
#' @param dupe The column name to look for (a single string).
#' @return `df` with the matching columns renamed.
modify_mcdi_dupes <- function(df, dupe = "leg") {
  # which() drops NA comparisons, so missing names never reach the assignment.
  dup_index <- which(names(df) == dupe)

  # Guard the no-match case explicitly instead of relying on 1:0 iterating
  # over index values that happen to be no-ops.
  if (length(dup_index) > 0) {
    names(df)[dup_index] <- paste0(dupe, "_", seq_along(dup_index))
  }
  df
}
#' Read a CSV with every column as character
#'
#' @param csv_fn Path to a readable CSV file (a single string).
#' @return The data frame produced by `readr::read_csv()`.
open_csv <- function(csv_fn) {
  # is.string()/is.readable() only return TRUE/FALSE; assert_that() is what
  # turns a failed check into an error.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  # Return the result directly: ending the function on an assignment would
  # hand it back invisibly.
  readr::read_csv(csv_fn,
                  col_types = readr::cols(.default = "c"),
                  show_col_types = FALSE)
}
#' Drop non-item columns from an 18/24-month CDI data frame
#'
#' The columns to remove are the same as for the 12-month form, so this
#' delegates to `trim_cdi_fields()` rather than repeating the pattern list.
#'
#' @param df A data frame.
#' @return `df` without columns whose names contain "note", "instructions",
#'   "comments", "continue", "vocab" or "mcdi".
trim_cdi_18_24_fields <- function(df) {
  trim_cdi_fields(df)
}
# Reshape a wide data frame to long form.
#
# df: a data frame; its first column is kept as-is and every later column is
#     pivoted.
# Returns a data frame with the first column plus `word` (the former column
# name) and `knows` (the cell value).
make_cdi_18_24_longer <- function(df) {
  item_cols <- seq(from = 2, to = ncol(df))
  tidyr::pivot_longer(
    df,
    cols = item_cols,
    names_to = "word",
    values_to = "knows"
  )
}
# Read one 18/24-month CDI file and return it in long form.
#
# csv_fn: path to the CSV file, passed to open_csv().
# Steps: shorten column names with basename(), number the repeated item
# columns, trim unneeded fields, rename participant_id to play_id, pivot longer.
clean_cdi_18_24_dedupe <- function(csv_fn) {
  raw <- open_csv(csv_fn)
  names(raw) <- basename(names(raw))

  # Items that occur more than once on the form, handled in this fixed order.
  for (item in c("leg", "candy", "rain", "wet")) {
    raw <- modify_mcdi_dupes(raw, dupe = item)
  }

  trimmed <- trim_cdi_18_24_fields(raw)
  renamed <- dplyr::rename(trimmed, "play_id" = "participant_id")
  make_cdi_18_24_longer(renamed)
}
eng_24 <- purrr::map(eng_24_files$fn, clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()
xtabs(~ word + knows, eng_24)
Now, we can return to the 18-mo-old data to see if this works:
eng_18 <- purrr::map(eng_18_files$fn, clean_cdi_18_24_dedupe) |>
  purrr::list_rbind()
xtabs(~ word + knows, eng_18)
It does.