MB-CDI

Purpose

This page documents the cleaning and merging procedures related to the MB-CDI data. The home visit workflow strips these files into their own set of CSVs under data/csv/home_visit/mbcdi.

The aggregate (across language group and age) data files are saved under data/csv/home_visit/agg.

Preparation

source(file.path(here::here(), "R", "_OLD", "functions.R"))

purrr::walk(list.files(file.path(here::here(), "R"), "\\.R$", full.names = TRUE), source)

Let’s investigate the number of files, records, and variables per file.

# Collect the full path of every CSV in the MB-CDI home visit directory.
mbcdi_fns <- list.files(
  path = file.path(here::here(), "data", "csv", "home_visit", "mbcdi"),
  pattern = "\\.csv$",
  full.names = TRUE
)

# Report how many files were found.
length(mbcdi_fns)

## [1] 25

# Summarize a single MB-CDI CSV file.
#
# Reads the file with every column typed as character and reports its
# dimensions together with the values returned by
# extract_age_group_from_name() and form_language() for the same path.
#
# Args:
#   csv: Path to a readable CSV file (a single string).
#
# Returns:
#   A one-row data frame with columns `fn`, `age_group`, `lang_cond`,
#   `n_subs` (row count) and `n_vars` (column count).
make_datafile_summary <- function(csv) {
  # The predicates must be wrapped in assert_that(); called on their own
  # their TRUE/FALSE result is simply discarded.
  assertthat::assert_that(assertthat::is.string(csv))
  assertthat::assert_that(assertthat::is.readable(csv))

  df <-
    readr::read_csv(csv,
                    col_types = readr::cols(.default = 'c'),
                    show_col_types = FALSE)

  data.frame(fn = csv,
             age_group = extract_age_group_from_name(csv),
             lang_cond = form_language(csv),
             n_subs = nrow(df),
             n_vars = ncol(df))
}

# Build one summary row per file and stack the rows into one data frame.
mbcdi_file_dat <- purrr::list_rbind(
  purrr::map(mbcdi_fns, make_datafile_summary)
)

# Display the summary ordered by age group, language condition, and row count.
kableExtra::kable_classic(
  knitr::kable(
    dplyr::arrange(mbcdi_file_dat, age_group, lang_cond, n_subs),
    format = 'html'
  )
)

fn	age_group	lang_cond	n_subs	n_vars
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/334099_mbcdi_12_bilingual_english.csv	12	bilingual_english	1	730
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411456_mbcdi_12_bilingual_english.csv	12	bilingual_english	1	717
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740623_mbcdi_12_bilingual_english.csv	12	bilingual_english	33	712
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411469_mbcdi_12_bilingual_spanish.csv	12	bilingual_spanish	1	717
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740624_mbcdi_12_bilingual_spanish.csv	12	bilingual_spanish	1	712
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331848_mbcdi_12_english.csv	12	english	4	256
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363431_mbcdi_12_english.csv	12	english	10	255
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740625_mbcdi_12_english.csv	12	english	172	259
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363466_mbcdi_18_bilingual_english.csv	18	bilingual_english	0	815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740626_mbcdi_18_bilingual_english.csv	18	bilingual_english	47	829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/411388_mbcdi_18_bilingual_spanish.csv	18	bilingual_spanish	0	815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740627_mbcdi_18_bilingual_spanish.csv	18	bilingual_spanish	5	829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1136694_mbcdi_18_english.csv	18	english	0	359
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/1151489_mbcdi_18_english.csv	18	english	0	359
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/307736_mbcdi_18_english.csv	18	english	4	352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363349_mbcdi_18_english.csv	18	english	9	352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740628_mbcdi_18_english.csv	18	english	157	358
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363465_mbcdi_24_bilingual_english.csv	24	bilingual_english	0	815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_english.csv	24	bilingual_english	36	829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/408149_mbcdi_24_bilingual_spanish.csv	24	bilingual_spanish	1	815
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740630_mbcdi_24_bilingual_spanish.csv	24	bilingual_spanish	2	829
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740631_mbcdi_24_bilingual_spanish.csv	24	bilingual_spanish	26	830
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/331453_mbcdi_24_english.csv	24	english	3	352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/363381_mbcdi_24_english.csv	24	english	8	352
/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/mbcdi/740629_mbcdi_24_english.csv	24	english	133	358

12-mo-old English speakers

# Keep the 12-month English files that contain at least one record.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '12',
  lang_cond == 'english',
  n_subs > 0
)

# Apply mcdi_clean_12_csv() to each file and stack the results.
eng_12_combined_df <- purrr::list_rbind(
  purrr::map(eng_12_files$fn, mcdi_clean_12_csv)
)

# Write the combined data to the aggregate directory.
eng_12_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_12_combined.csv"
)
readr::write_csv(eng_12_combined_df, eng_12_fn)

There are \(n=\) 186 participant records.

18-mo-old English speakers

This code should be wrapped in functions since many of the components duplicate one another.

# Keep the 18-month English files that contain at least one record.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '18',
  lang_cond == 'english',
  n_subs > 0
)

# Apply mcdi_clean_18_24_csv() to each file and stack the results.
eng_18_combined_df <- purrr::list_rbind(
  purrr::map(eng_18_files$fn, mcdi_clean_18_24_csv)
)

# Write the combined data to the aggregate directory.
eng_18_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_18_combined.csv"
)
readr::write_csv(eng_18_combined_df, eng_18_fn)

There are \(n=\) 170 participant records.

24-mo-old English speakers

# Keep the 24-month English files that contain at least one record.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '24',
  lang_cond == 'english',
  n_subs > 0
)

# Apply mcdi_clean_18_24_csv() to each file and stack the results.
eng_24_combined_df <- purrr::list_rbind(
  purrr::map(eng_24_files$fn, mcdi_clean_18_24_csv)
)

# Write the combined data to the aggregate directory.
eng_24_fn <- file.path(
  here::here(), "data", "csv", "home_visit", "agg",
  "mcdi_english_24_combined.csv"
)
readr::write_csv(eng_24_combined_df, eng_24_fn)

There are \(n=\) 144 participant records.

Old code

The following code is deprecated as of 2023-10-25, and is not run.

For simplicity, we’ll start with the youngest age group, and with the English speakers. There are \(n=3\) forms, with 4, 10, and 111 participants each, and 254, 253, and 257 variables.

# Keep the 12-month English files that contain at least one record.
eng_12_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '12',
  lang_cond == 'english',
  n_subs > 0
)

We’ll examine the first one.

# Read the first 12-month English file with every column as character.
eng_12_331 <- readr::read_csv(
  file = eng_12_files$fn[1],
  col_types = readr::cols(.default = 'c'),
  show_col_types = FALSE
)

# Peek at the first few column names.
head(names(eng_12_331))

Let’s try trimming the metadata labels.

# Keep only the final path component of each column name.
eng_12_331_trim_names <- basename(names(eng_12_331))

head(eng_12_331_trim_names)

That looks better. Let’s look at the second file.

# Read the second 12-month English file with every column as character.
eng_12_363 <- readr::read_csv(
  file = eng_12_files$fn[2],
  col_types = readr::cols(.default = 'c'),
  show_col_types = FALSE
)

# Peek at the first few trimmed column names.
head(basename(names(eng_12_363)))

And the third one.

# Read the third 12-month English file with every column as character.
eng_12_740625 <- readr::read_csv(
  file = eng_12_files$fn[3],
  col_types = readr::cols(.default = 'c'),
  show_col_types = FALSE
)

# Peek at the first few trimmed column names.
head(basename(names(eng_12_740625)))

Now, we’ll create a function to clean the variable names.

# Read a CSV and shorten every column name to its final path component.
#
# Args:
#   csv_fn: Path to a readable CSV file (a single string).
#
# Returns:
#   The data read by readr::read_csv() (all columns as character) with
#   names(df) replaced by basename(names(df)).
select_basename <- function(csv_fn) {
  # The predicates must be wrapped in assert_that(); called on their own
  # their TRUE/FALSE result is simply discarded.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  df <- readr::read_csv(csv_fn,
                        col_types = readr::cols(.default = 'c'),
                        show_col_types = FALSE)

  # basename() strips everything up to the last "/" in each column name.
  names(df) <- basename(names(df))
  df
}

# Show the first rows of the third file after trimming its column names.
head(select_basename(eng_12_files$fn[3]))

Let’s trim unneeded fields. We’ll write several helper functions to do this.

# Drop every column whose name contains one of the listed substrings.
#
# Args:
#   df: A data frame.
#
# Returns:
#   `df` without the matching columns; remaining columns keep their order.
trim_cdi_fields <- function(df) {
  # contains() takes the union of matches when given several substrings.
  unwanted <- c("note", "instructions", "comments", "continue", "vocab", "mcdi")
  dplyr::select(df, -contains(unwanted))
}

# Add a `play_i` column that numbers the rows of `df` from 1 to nrow(df).
#
# Args:
#   df: A data frame.
#
# Returns:
#   `df` with an integer `play_i` column added.
add_particip_index <- function(df) {
  # seq_len() gives an empty index for a zero-row input; 1:0 would give
  # c(1, 0) and make mutate() fail on a size mismatch.
  dplyr::mutate(df, play_i = seq_len(nrow(df)))
}

# Reshape a wide CDI data frame to one row per (first column, word) pair.
#
# Every column after the first is pivoted into `word` /
# `understands_or_says`, rows with a missing response are dropped, and a
# few response and word labels are rewritten.
#
# Args:
#   df: A data frame whose first column is kept as the identifier.
#
# Returns:
#   A long data frame with the first column of `df`, `word`, and
#   `understands_or_says`.
make_cdi_longer <- function(df) {
  n_vars <- ncol(df)

  long_df <- tidyr::pivot_longer(
    df,
    cols = 2:n_vars,
    names_to = "word",
    values_to = "understands_or_says"
  )

  answered_df <- dplyr::filter(long_df, !is.na(understands_or_says))

  # mutate() evaluates its arguments in order, so each replacement sees the
  # result of the previous one.
  dplyr::mutate(
    answered_df,
    understands_or_says = stringr::str_replace(understands_or_says, "understands___", "says"),
    understands_or_says = stringr::str_replace(understands_or_says, "understands_says", "says"),
    word = stringr::str_replace(word, "mommy_001", "mommy"),
    word = stringr::str_replace(word, "bath_001", "bath")
  )
}

Then we combine them into an omnibus function.

# Run the full cleaning sequence on one CDI CSV file.
#
# Steps, in order: read and trim column names, drop unneeded fields, rename
# `participant_id` to `play_id`, pivot to long form, add the row index.
#
# Args:
#   csv_fn: Path to a CSV file.
#
# Returns:
#   The long-form data frame produced by add_particip_index().
clean_cdi <- function(csv_fn) {
  raw_df <- select_basename(csv_fn)
  trimmed_df <- trim_cdi_fields(raw_df)
  renamed_df <- dplyr::rename(trimmed_df, "play_id" = "participant_id")
  long_df <- make_cdi_longer(renamed_df)
  add_particip_index(long_df)
}

Now, we can run clean_cdi() across all three files.

# Clean every 12-month English file and stack the results.
eng_12 <- purrr::list_rbind(purrr::map(eng_12_files$fn, clean_cdi))

# Cross-tabulate words against responses.
xtabs(~ word + understands_or_says, data = eng_12)

18-mo-old English speakers

Let’s move on to the 18-mo-old English speakers.

# Keep the 18-month English files that contain at least one record.
eng_18_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '18',
  lang_cond == 'english',
  n_subs > 0
)

There are \(n=\) 3 files with participant data.

# Apply clean_cdi() to each of the first three 18-month English files in turn;
# each result is printed so it can be inspected.
clean_cdi(eng_18_files$fn[1])
clean_cdi(eng_18_files$fn[2])
clean_cdi(eng_18_files$fn[3])

There are some duplicate entries for some words.

~~We need a strategy for reconciling these duplicates: candy, leg, rain, wet.~~

It’s not elegant, but I have one for modifying the duplicate names. See below.

24-mo-old English speakers

Let’s move on to the 24-mo-old English speakers.

# Keep the 24-month English files that contain at least one record.
eng_24_files <- dplyr::filter(
  mbcdi_file_dat,
  age_group == '24',
  lang_cond == 'english',
  n_subs > 0
)

There are \(n=\) 3 files with participant data.

Let’s see how the clean_cdi() works on one of these.

# Apply clean_cdi() to the first 24-month English file and print the result.
clean_cdi(eng_24_files$fn[1])

Once again, we have duplicates for several items: ‘candy’, ‘leg’, ‘rain’, ‘wet’.

It’s very hacky, but I think we might want to modify these item names until we figure out a better way to handle the duplicates.

# Make repeated column names unique by appending a running index.
#
# Every column whose name equals `dupe` is renamed to `<dupe>_1`,
# `<dupe>_2`, ... in column order. A name that occurs only once is still
# renamed to `<dupe>_1`. If no column matches, `df` is returned unchanged.
#
# Args:
#   df: A data frame.
#   dupe: The column name to disambiguate (a single string).
#
# Returns:
#   `df` with the matching column names replaced.
modify_mcdi_dupes <- function(df, dupe = 'leg') {
  # which() ignores NA comparisons and returns integer(0) when nothing matches.
  dup_index <- which(names(df) == dupe)

  # Explicit guard: 1:length(dup_index) would iterate over c(1, 0) here.
  if (length(dup_index) == 0) {
    return(df)
  }

  names(df)[dup_index] <- paste0(dupe, "_", seq_along(dup_index))
  df
}

# Read a CSV file with every column typed as character.
#
# Args:
#   csv_fn: Path to a readable CSV file (a single string).
#
# Returns:
#   The data frame produced by readr::read_csv().
open_csv <- function(csv_fn) {
  # The predicates must be wrapped in assert_that(); called on their own
  # their TRUE/FALSE result is simply discarded.
  assertthat::assert_that(assertthat::is.string(csv_fn))
  assertthat::assert_that(assertthat::is.readable(csv_fn))

  # Return the result directly: ending on an assignment returned it invisibly.
  readr::read_csv(csv_fn,
                  col_types = readr::cols(.default = 'c'),
                  show_col_types = FALSE)
}

# Drop every column whose name contains one of the listed substrings.
#
# Args:
#   df: A data frame.
#
# Returns:
#   `df` without the matching columns; remaining columns keep their order.
trim_cdi_18_24_fields <- function(df) {
  # contains() takes the union of matches when given several substrings.
  dplyr::select(
    df,
    -contains(c("note", "instructions", "comments", "continue", "vocab", "mcdi"))
  )
}

# Reshape a wide CDI data frame to one row per (first column, word) pair.
#
# Args:
#   df: A data frame whose first column is kept as the identifier.
#
# Returns:
#   A long data frame with the first column of `df`, `word`, and `knows`.
#   Missing responses are kept.
make_cdi_18_24_longer <- function(df) {
  n_vars <- ncol(df)
  tidyr::pivot_longer(
    data = df,
    cols = 2:n_vars,
    names_to = "word",
    values_to = "knows"
  )
}

# Clean one 18- or 24-month CDI CSV, disambiguating repeated item names.
#
# Steps, in order: read the file, trim column names to their final path
# component, index the columns named 'leg', 'candy', 'rain', and 'wet', drop
# unneeded fields, rename `participant_id` to `play_id`, pivot to long form.
#
# Args:
#   csv_fn: Path to a CSV file.
#
# Returns:
#   The long-form data frame produced by make_cdi_18_24_longer().
clean_cdi_18_24_dedupe <- function(csv_fn) {
  raw_df <- open_csv(csv_fn)
  names(raw_df) <- basename(names(raw_df))

  # Apply the renaming once per item, in the same order as before.
  deduped_df <- raw_df
  for (item in c('leg', 'candy', 'rain', 'wet')) {
    deduped_df <- modify_mcdi_dupes(deduped_df, dupe = item)
  }

  trimmed_df <- trim_cdi_18_24_fields(deduped_df)
  renamed_df <- dplyr::rename(trimmed_df, "play_id" = "participant_id")
  make_cdi_18_24_longer(renamed_df)
}

# Clean every 24-month English file and stack the results.
eng_24 <- purrr::list_rbind(
  purrr::map(eng_24_files$fn, clean_cdi_18_24_dedupe)
)
# Cross-tabulate words against responses.
xtabs(~ word + knows, data = eng_24)

Now, we can return to the 18-mo-old data to see if this works:

# Clean every 18-month English file and stack the results.
eng_18 <- purrr::list_rbind(
  purrr::map(eng_18_files$fn, clean_cdi_18_24_dedupe)
)
# Cross-tabulate words against responses.
xtabs(~ word + knows, data = eng_18)

It does.

QA for merging

Post-visit notes