Modified

September 17, 2024

About

This page documents the process for combining separate files into aggregate files containing data from many participants.

Setup

Code
# Find the project helper scripts in R/ whose names match one of the listed
# prefixes/keywords, source each of them, then attach the tidyverse quietly.
fl <- file.path(here::here(), "R") |>
  list.files(
    pattern = "^load|^kobo_|^file_|^screen_|^ecbq_|^health_|^databrary|^home|^make|^export|^post_visit|CONSTANTS|utils",
    full.names = TRUE
  )
fl |> purrr::walk(source)

suppressPackageStartupMessages(library(tidyverse))

Remove identifiers

The non-MBCDI file contains the identifiers, so that is the target of this removal process.

Note that we have added data to .gitignore in protocol/, the root directory for the HTML protocol, so none of the data files should be made available via git or GitHub. This also means that there is no version control being done on raw data files themselves.

# Pipeline target: apply the de-identification helper to every file in
# `home_visit_non_mbcdi`, collecting one character value per file.
# NOTE(review): the interactive chunk that follows calls
# `file_open_deidentify_save`; confirm `open_deidentify_save` is still defined.
tar_target(
  name = home_visit_remove_identifiers,
  command = purrr::map_chr(
    home_visit_non_mbcdi,
    open_deidentify_save,
    csv_save_dir = "data/csv/home_visit/non_mbcdi/deid",
    these_questions = 'non_mbcdi'
  )
)
Code
# Gather the full paths of the raw non-MB-CDI CSV files.
home_visit_non_mbcdi <-
  file.path(here::here(), "data/csv/home_visit/non_mbcdi/raw") |>
  list.files(pattern = "\\.csv$", full.names = TRUE)

# Run the de-identification helper on each raw file; the resulting character
# vector (one element per input file) is printed.
home_visit_non_mbcdi |>
  purrr::map_chr(
    file_open_deidentify_save,
    csv_save_dir = file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid"),
    these_questions = 'non_mbcdi'
  )
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1136694_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1151489_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/307736_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331453_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331848_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/334099_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363349_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363381_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363431_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363465_non_mbcdi_24_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363466_non_mbcdi_18_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/408149_non_mbcdi_24_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411388_non_mbcdi_18_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411456_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411469_non_mbcdi_12_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv`
 [1] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1136694_non_mbcdi_18_english_deidentified.csv"         
 [2] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1151489_non_mbcdi_18_english_deidentified.csv"         
 [3] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/307736_non_mbcdi_18_english_deidentified.csv"          
 [4] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331453_non_mbcdi_24_english_deidentified.csv"          
 [5] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331848_non_mbcdi_12_english_deidentified.csv"          
 [6] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/334099_non_mbcdi_12_bilingual_english_deidentified.csv"
 [7] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363349_non_mbcdi_18_english_deidentified.csv"          
 [8] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363381_non_mbcdi_24_english_deidentified.csv"          
 [9] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363431_non_mbcdi_12_english_deidentified.csv"          
[10] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363465_non_mbcdi_24_bilingual_english_deidentified.csv"
[11] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363466_non_mbcdi_18_bilingual_english_deidentified.csv"
[12] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/408149_non_mbcdi_24_bilingual_spanish_deidentified.csv"
[13] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411388_non_mbcdi_18_bilingual_spanish_deidentified.csv"
[14] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411456_non_mbcdi_12_bilingual_english_deidentified.csv"
[15] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411469_non_mbcdi_12_bilingual_spanish_deidentified.csv"
[16] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv"
[17] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv"
[18] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv"          
[19] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv"
[20] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv"
[21] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv"          
[22] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv"          
[23] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv"
[24] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv"

Quality assurance (QA) reviews

MB-CDI files

Note

To be completed.

Non-MB-CDI files

Create a helper function to create a data set with summary information about the data files.

Code
#' Summarise the dimensions of one non-MB-CDI CSV file
#'
#' @param fn Path to a single CSV file (character vector of length one).
#' @return A one-row tibble with `file_name` (the base name of `fn`),
#'   `n_rows`, and `n_vars` (row and column counts of the parsed file).
#' @details Stops with an error if `fn` is not a length-one character vector
#'   or if the file does not exist.
summarize_non_mbcdi_qs <- function(fn) {
  stopifnot(is.character(fn))
  # `file.exists()` is vectorised, so a multi-element `fn` would otherwise
  # reach `if ()` with a length > 1 condition; reject it up front instead.
  stopifnot(length(fn) == 1L)

  if (!file.exists(fn)) {
    stop('File not found `', fn, '`')
  }

  # `readr::read_csv()` either returns a tibble or signals its own error, so
  # no separate data-frame check is needed afterwards.
  df <- readr::read_csv(fn, show_col_types = FALSE)

  tibble::tibble(
    file_name = basename(fn),
    n_rows = nrow(df),
    n_vars = ncol(df)
  )
}

Select the de-identified CSVs to examine.

Code
# Select the de-identified CSVs whose names carry a 12, 18, or 24 age-group tag.
# The tag is matched with an alternation group: a bracket expression such as
# `[12|18|24]` would match any single character from 1, 2, |, 8, 4 rather than
# one of the three two-digit values.
fl <-
  list.files(
    file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid"),
    pattern = '^[0-9]+_non_mbcdi_(12|18|24).*deidentified',
    full.names = TRUE
  )

# One summary row (file name, row count, column count) per selected file.
PLAY_forms <- purrr::map_df(fl, summarize_non_mbcdi_qs)

# Render the summary as an HTML table.
PLAY_forms %>%
  knitr::kable(format = 'html') %>%
  kableExtra::kable_classic()
file_name n_rows n_vars
1136694_non_mbcdi_18_english_deidentified.csv 0 288
1151489_non_mbcdi_18_english_deidentified.csv 0 286
307736_non_mbcdi_18_english_deidentified.csv 4 274
331453_non_mbcdi_24_english_deidentified.csv 3 274
331848_non_mbcdi_12_english_deidentified.csv 4 267
334099_non_mbcdi_12_bilingual_english_deidentified.csv 1 267
363349_non_mbcdi_18_english_deidentified.csv 9 280
363381_non_mbcdi_24_english_deidentified.csv 8 280
363431_non_mbcdi_12_english_deidentified.csv 10 281
363465_non_mbcdi_24_bilingual_english_deidentified.csv 0 280
363466_non_mbcdi_18_bilingual_english_deidentified.csv 0 280
408149_non_mbcdi_24_bilingual_spanish_deidentified.csv 1 280
411388_non_mbcdi_18_bilingual_spanish_deidentified.csv 0 280
411456_non_mbcdi_12_bilingual_english_deidentified.csv 1 280
411469_non_mbcdi_12_bilingual_spanish_deidentified.csv 1 280
740623_non_mbcdi_12_bilingual_english_deidentified.csv 49 288
740624_non_mbcdi_12_bilingual_spanish_deidentified.csv 5 288
740625_non_mbcdi_12_english_deidentified.csv 251 288
740626_non_mbcdi_18_bilingual_english_deidentified.csv 63 287
740627_non_mbcdi_18_bilingual_spanish_deidentified.csv 8 287
740628_non_mbcdi_18_english_deidentified.csv 234 288
740629_non_mbcdi_24_english_deidentified.csv 195 287
740630_non_mbcdi_24_bilingual_spanish_deidentified.csv 4 287
740631_non_mbcdi_24_bilingual_english_deidentified.csv 49 287

The later forms (with higher form numbers–the leading integers in the file names) are the newer ones. These generally have the largest number of entries and have similar numbers of columns–either 287 or 288. Accordingly, we focus our cleaning efforts here first.

We start with the data files that have \(n=288\) columns.

Code
# Read forms 740623 and 740624 from the de-identified directory.
df740623 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

df740624 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

# Number of positions at which the two files share the same column name.
sum(names(df740623) == names(df740624))
[1] 288
Code
# Read form 740625 and count the column names that match 740623 by position.
df740625 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

sum(names(df740623) == names(df740625))
[1] 288
Code
# Read form 740628 and count the column names that match 740623 by position.
df740628 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

sum(names(df740623) == names(df740628))
[1] 288

So, four of the most recent data files with \(n=288\) columns can be aggregated without modification.

Let’s turn to the more recent files with \(n=287\) columns.

Code
# Read forms 740626 and 740627 from the de-identified directory.
df740626 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

df740627 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

# Number of positions at which the two files share the same column name.
sum(names(df740626) == names(df740627))
[1] 100

Where does the misalignment arise?

Code
# Compare the two name vectors position by position to locate the mismatches.
names(df740626) == names(df740627)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

The misalignment begins at column 92 and persists through column 278; the first 91 and the last 9 column names agree.

Code
# Read form 740629 and count the column names that match 740626 by position.
df740629 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

sum(names(df740626) == names(df740629))
[1] 287

So, df740626 and df740629 are aligned and can be merged.

Code
# Read form 740630 and count the column names that match 740626 by position.
df740630 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

sum(names(df740626) == names(df740630))
[1] 100
Code
# Compare the two name vectors position by position to locate the mismatches.
names(df740626) == names(df740630)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

These files also fall out of alignment near column 92.

Code
# Read form 740631 and count the column names that match 740626 by position.
df740631 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv"
) |>
  readr::read_csv(show_col_types = FALSE)

sum(names(df740626) == names(df740631))
[1] 100
Code
# Compare the two name vectors position by position to locate the mismatches.
names(df740626) == names(df740631)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

And these files fall out of alignment near column 92.

Let’s see if df740627, df740630, and df740631 are aligned with one another.

Code
# Count the positions at which forms 740627 and 740630 share a column name.
sum(names(df740627) == names(df740630))
[1] 287
Code
# Count the positions at which forms 740627 and 740631 share a column name.
sum(names(df740627) == names(df740631))
[1] 287

Yes, they are. So, these three can be merged. We do that first, then address the discrepancies between aggregates.

‘Older’ forms

The “older” forms have varied numbers of columns. We focus on those that contain data (n_rows > 0).

Code
# Read each of the older forms into its own data frame, named after the
# leading form number in the file name.
df307736 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/307736_non_mbcdi_18_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df331453 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/331453_non_mbcdi_24_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df331848 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/331848_non_mbcdi_12_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df334099 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/334099_non_mbcdi_12_bilingual_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df363349 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/363349_non_mbcdi_18_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df363381 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/363381_non_mbcdi_24_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df363431 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/363431_non_mbcdi_12_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df408149 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/408149_non_mbcdi_24_bilingual_spanish_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df411456 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/411456_non_mbcdi_12_bilingual_english_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

df411469 <- file.path(
  here::here(),
  "data/csv/home_visit/non_mbcdi/deid/411469_non_mbcdi_12_bilingual_spanish_deidentified.csv"
) |>
  read_csv(show_col_types = FALSE)

Let’s look at the two forms that have the same number of columns, \(n=274\), 307736 and 331453.

Code
# Compare the column names of the two 274-column files position by position.
names(df307736) == names(df331453)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[256] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[271] TRUE TRUE TRUE TRUE
Code
# TRUE only when both files have exactly the same column names in the same
# order. (Comparing the length of the `==` result with the length of one name
# vector would be TRUE for any two equal-length vectors, matching or not.)
identical(names(df307736), names(df331453))
[1] TRUE

So, these two files have identical column names and could be merged.

Code
# List every de-identified home-visit CSV.
hv_deid_fl <-
  file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid") |>
  list.files(pattern = "\\.csv$", full.names = TRUE)

# Flag the paths whose file name starts with form number 307736 or 331453.
files_274_cols <- hv_deid_fl |>
  stringr::str_detect("/(307736|331453)")

# Combine the flagged files with the project's aggregation helper.
df_merge_274_cols <- hv_deid_fl[files_274_cols] |>
  file_make_aggregate_from_csvs()

How about the files with \(n=267\) columns, 331848 and 334099?

Code
# Compare the column names of the two 267-column files position by position.
names(df331848) == names(df334099)
  [1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[265]  TRUE  TRUE  TRUE
Code
# Confirm only that the two files have the same number of columns; this says
# nothing about whether the names themselves agree.
length(names(df331848)) == length(names(df334099))
[1] TRUE
Code
# Show the first six column names of form 331848.
names(df331848) |> head()
[1] "group_combinedquestionnaires/participant_id"             
[2] "start"                                                   
[3] "end"                                                     
[4] "group_combinedquestionnaires/note_fillthisoutbeforestudy"
[5] "group_combinedquestionnaires/site_id"                    
[6] "group_combinedquestionnaires/subject_number"             
Code
# Show the first six column names of form 334099.
names(df334099) |> head()
[1] "group_jo84c13/participant_id"             
[2] "start"                                    
[3] "end"                                      
[4] "group_jo84c13/note_fillthisoutbeforestudy"
[5] "group_jo84c13/site_id"                    
[6] "group_jo84c13/subject_number"             

There is an odd difference in the group label, group_combinedquestionnaires vs. group_jo84c13.

Let’s try deleting the initial group labels and compare again.

Code
# Keep the original column names of both files.
n1 <- names(df331848)
n2 <- names(df334099)

# Preview the first file's names with its leading group label removed.
head(stringr::str_remove(n1, "group_combinedquestionnaires/"))
[1] "participant_id"              "start"                      
[3] "end"                         "note_fillthisoutbeforestudy"
[5] "site_id"                     "subject_number"             
Code
# Preview the second file's names with its leading group label removed.
head(stringr::str_remove(names(df334099), "group_jo84c13/"))
[1] "participant_id"              "start"                      
[3] "end"                         "note_fillthisoutbeforestudy"
[5] "site_id"                     "subject_number"             

That looks promising.

Code
# Store each file's column names with the form-specific leading group label
# removed, then compare them position by position.
n1 <- stringr::str_remove(names(df331848), "group_combinedquestionnaires/")
n2 <- stringr::str_remove(names(df334099), "group_jo84c13/")
n1 == n2
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[265]  TRUE  TRUE  TRUE
Code
# Show names 8 through 15 of each file side by side to see where they diverge.
cbind(n1[8:15], n2[8:15])
     [,1]                     [,2]                    
[1,] "test_date"              "test_date"             
[2,] "child_sex"              "child_birth_date"      
[3,] "age_group"              "child_sex"             
[4,] "language_child"         "age_group"             
[5,] "language_child/english" "language_child"        
[6,] "language_child/spanish" "language_child/english"
[7,] "language_instruction"   "language_child/spanish"
[8,] "acknowledge_site"       "language_instruction"  

n2 (the column names from df334099) has a child_birth_date field in position 9 that the other data frame does not have.

Code
# Count how many names in n1 contain "child_birth_date".
sum(str_detect(n1, "child_birth_date"))
[1] 0

If we delete that variable, the data frames will no longer have the same number of columns. Let’s explore that anyway.

Code
# Drop element 9 of n2 (the child_birth_date name that n1 lacks).
n2_2 <- n2[-9]

# n2_2 is now one element shorter than n1, so `==` recycles n2_2 and warns;
# only the first length(n2_2) comparisons are position-matched.
n1 == n2_2
Warning in n1 == n2_2: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE
 [37]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE

That helps a bit, but we diverge around column 29.

Code
# Show names 28 through 51 side by side, spanning the point of divergence.
cbind(n1[28:51], n2_2[28:51])
      [,1]                                                                                                
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"              
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/birthhospital"
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/afterhome"    
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"           
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"      
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/donotknow"    
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"               
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/birthhospital" 
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/afterhome"     
[10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"            
[11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"       
[12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
[13,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
[14,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
[15,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
[16,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
[17,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
[18,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
[19,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
[20,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
[21,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
[22,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
[23,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
[24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
      [,2]                                                                                                                                  
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"                                                
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__in_the_bi"                                 
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__after_goi"                                 
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"                                             
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"                                        
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/don_t_know"                                     
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"                                                 
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__in_the_bi"                                  
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__after_goi"                                  
[10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"                                              
[11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"                                         
[12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/don_t_know"                                      
[13,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/child_allergies_infections_ill_header"
[14,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"                            
[15,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"                        
[16,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"                               
[17,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"                          
[18,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"                     
[19,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                                                
[20,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                                                  
[21,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                                      
[22,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                                             
[23,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                                     
[24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                                                         

These question labels look very similar. There are just some minor changes in the variable names. n2_2 has an extra variable in column 40.

Code
# Drop element 40 of n2_2: the `child_allergies_infections_ill_header` entry,
# which has no counterpart in n1.
n2_3 <- n2_2[-40]

Then, we can rename some of the columns in n2_3 using corresponding names from n1.

Code
# Swap the response-option suffixes used in n2_3 for the spellings used in n1,
# one replacement per step, then compare the two name vectors element-wise.
n2_4 <- n2_3 |>
  stringr::str_replace("yes__in_the_bi", "birthhospital") |>
  stringr::str_replace("yes__after_goi", "afterhome") |>
  stringr::str_replace("don_t_know", "donotknow")

n1 == n2_4
Warning in n1 == n2_4: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[229]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE
Code
# Show elements 39-51 of n1 (column 1) next to the same positions of n2_4
# (column 2), i.e. the stretch where the comparison above turns FALSE.
cbind(n1[39:51], n2_4[39:51])
      [,1]                                                                                                
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
[10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
[11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
[12,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
[13,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
      [,2]                                                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"                  
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"       
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"   
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"          
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"     
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                           
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                             
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                 
[10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                        
[11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                
[12,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                                    
[13,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"                                      

n1 has a group_medicalprof label from allergies through gastrointestinal; n2_4 has child_allergies_infections_ill for the same questions.

Code
# Relabel the `child_allergies_infections_ill` group in n2_4 with the
# `group_medicalprof` label used in n1, then compare element-wise again.
n2_5 <- stringr::str_replace(
  n2_4,
  "child_allergies_infections_ill",
  "group_medicalprof"
)
n1 == n2_5
Warning in n1 == n2_5: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[229]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE
Code
# Show elements 49-60 of n1 (column 1) next to the same positions of n2_5
# (column 2), around the point where the comparison above turns FALSE.
cbind(n1[49:60], n2_5[49:60])
      [,1]                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
 [2,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"        
 [3,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
 [4,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
 [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
 [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
 [7,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
 [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
 [9,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
[10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
[11,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
[12,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
      [,2]                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
 [2,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
 [3,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
 [4,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
 [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
 [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
 [7,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
 [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
 [9,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
[10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
[11,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
[12,] "group_homevisitquestionnaires/group_health/group_drinking/pregnant_drinking"    

It looks like these could be reconciled by deleting prenatal_care from n1.

Code
# Drop element 50 of n1 (`prenatal_care`, which does not appear in n2_5 at
# this position), then repeat the element-wise comparison.
n1_2 <- n1[-50]
# The two vectors still differ in length, so `==` recycles the shorter one
# and emits the warning shown below.
n1_2 == n2_5
Warning in n1_2 == n2_5: longer object length is not a multiple of shorter
object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE
Code
# Show elements 64-75 of n1_2 (column 1) next to the same positions of n2_5
# (column 2), around the point where the comparison above turns FALSE.
cbind(n1_2[64:75], n2_5[64:75])
      [,1]                                                                                                          
 [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                                 
 [2,] "group_homevisitquestionnaires/group_health/group_phq4/note_phq4"                                             
 [3,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_nervous"                           
 [4,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_worrying"                          
 [5,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_littleinterest"                    
 [6,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_down"                              
 [7,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                         
 [8,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions1"              
 [9,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions2"              
[10,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_unfamiliarperson"
[11,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_troubletask"     
[12,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_companyofchild"  
      [,2]                                                                                                       
 [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                              
 [2,] "group_homevisitquestionnaires/group_health/group_phq4/Experimenter_These_stions_are_about_you"            
 [3,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                      
 [4,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructor_rothbart"                           
 [5,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructions_rothbart2"                        
 [6,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/rothbart_questions_header"                
 [7,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_approached_by_a_ld_cling_to_a_parent"
 [8,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_having_trouble_get_easily_irritated"
 [9,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_a_familiar_chil_company_of_the_child"
[10,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_offered_a_choic_uickly_and_go_for_it"
[11,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/During_daily_or_even_eing_quietly_sung_to"
[12,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_playing_outdoo_and_excitement_of_it"

It looks like the individual PHQ-4 items (phq4_nervous through phq4_down) are not in n2_5.

Let’s check.

Code
# Count how many entries of n2_5 contain "phq4".
sum(stringr::str_detect(n2_5, "phq4"))
[1] 2

Yes, only two of the names in n2_5 mention phq4. The same is true of the column names of df334099 itself.

Code
# Count how many column names of df334099 contain "phq4".
sum(stringr::str_detect(names(df334099), "phq4"))
[1] 2

This path of reconciliation does not appear fruitful.

Make aggregate files

non-MB-CDI files with \(n=288\) columns

Code
# Logical mask over hv_deid_fl: TRUE for paths matching the pattern below
# (the files this document groups under "288 columns").
files_288_cols <- stringr::str_detect(
  hv_deid_fl,
  "2[3458]_non_mbcdi.*_deidentified\\.csv"
)

# Combine the selected CSVs into a single aggregate.
df_merge_288_cols <- hv_deid_fl[files_288_cols] |>
  file_make_aggregate_from_csvs()

non-MB-CDI files with \(n=287\) columns

Code
# First subgroup: paths matching the two-digit-suffix pattern below.
files_287_cols_1 <- stringr::str_detect(
  hv_deid_fl,
  "2[69]_non_mbcdi.*_deidentified\\.csv"
)
df_merge_287_cols_1 <- hv_deid_fl[files_287_cols_1] |>
  file_make_aggregate_from_csvs()

# Second subgroup: three files picked out by their leading numeric label.
files_287_cols_2 <- stringr::str_detect(
  hv_deid_fl,
  "(740627|740630|740631)_non.*_deidentified\\.csv"
)
df_merge_287_cols_2 <- hv_deid_fl[files_287_cols_2] |>
  file_make_aggregate_from_csvs()

Examine groups with \(n=287\) cols

We focus on the starting column where the column names diverge, column 92.

Code
# Disabled: alternative way to obtain the two aggregates from a targets store.
# targets::tar_load(df_merge_287_cols_1, store="../_targets")
# targets::tar_load(df_merge_287_cols_2, store="../_targets")
# Name of column 92 in the first 287-column aggregate.
names(df_merge_287_cols_1)[92]
[1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_feeding_nutrition.instructions_feeding"
Code
# Name of column 92 in the second 287-column aggregate, for comparison.
names(df_merge_287_cols_2)[92]
[1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_locomotor_milestones.group_health.group_feeding_nutrition.instructions_feeding"

There is an erroneous group_locomotor_milestones. in the df_merge_287_cols_2 column name.

A bit of sleuthing determines that this group_locomotor_milestones. label is characteristic of columns 92 to 273.

Code
# For columns 92-273 of the second aggregate, flag each name that contains
# "group_locomotor_milestones".
stringr::str_detect(
  names(df_merge_287_cols_2)[92:273],
  pattern = "group_locomotor_milestones"
)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE

The following should fix this.

Code
# Keep the current column names, then build a corrected copy in which the
# stray "group_locomotor_milestones." component is removed from columns
# 92-273 only; every other name is left untouched.
old_names <- names(df_merge_287_cols_2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df_merge_287_cols_2) <- new_names
Code
# Element-wise comparison of the two aggregates' column names; TRUE marks
# positions where the names agree.
names(df_merge_287_cols_2) == names(df_merge_287_cols_1)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

We have a second problem with columns from 114 to 210, plus further mismatches at column 264 and at columns 274 to 278.

Code
# Stack the names of columns 113-115 from both aggregates
# (row 1: df_merge_287_cols_1, row 2: df_merge_287_cols_2).
rbind(
  names(df_merge_287_cols_1)[113:115],
  names(df_merge_287_cols_2)[113:115]
)
     [,1]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
     [,2]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
     [,3]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"

One of the problems has to do with column 114. There is a question ending doctor_told_you in names(df_merge_287_cols_1) but not in names(df_merge_287_cols_2).

Code
# Number of column names in the first aggregate containing "doctor_told_you".
sum(
  stringr::str_detect(
    names(df_merge_287_cols_1),
    pattern = "doctor_told_you"
  )
)
[1] 1
Code
# Number of column names in the second aggregate containing "doctor_told_you".
sum(
  stringr::str_detect(
    names(df_merge_287_cols_2),
    pattern = "doctor_told_you"
  )
)
[1] 0

Deleting this question would create additional misalignments and further problems. We cannot proceed without further discussion with our team.

For now, let’s generate an array with all of the remaining differences in column names.

Code
# Logical mask of the positions where the two aggregates' column names
# disagree, followed by the number of such positions.
names_differ <- names(df_merge_287_cols_2) != names(df_merge_287_cols_1)
sum(names_differ)
[1] 103
Code
# List every mismatching pair of names
# (row 1: df_merge_287_cols_1, row 2: df_merge_287_cols_2).
rbind(
  names(df_merge_287_cols_1)[names_differ],
  names(df_merge_287_cols_2)[names_differ]
)
     [,1]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
     [,2]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
     [,3]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"       
     [,4]                                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"
     [,5]                                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
     [,6]                                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"                           
     [,7]                                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"  
     [,8]                                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"    
     [,9]                                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
     [,10]                                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"        
     [,11]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"        
     [,12]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
     [,13]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"  
     [,14]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
     [,15]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
     [,16]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
     [,17]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"    
     [,18]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
     [,19]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"         
     [,20]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"  
     [,21]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"
     [,22]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"
     [,23]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
     [,24]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
     [,25]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
     [,26]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"   
     [,27]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"            
     [,28]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"        
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
     [,29]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous"     
     [,30]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"
     [,31]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
     [,32]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"          
     [,33]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
     [,34]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
     [,35]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"
     [,36]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"                        
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"
     [,37]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
     [,38]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"     
     [,39]                                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"
     [,40]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
     [,41]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"     
     [,42]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
     [,43]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"     
     [,44]                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks"
     [,45]                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
     [,46]                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"        
     [,47]                                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"
     [,48]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"
     [,49]                                                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
     [,50]                                                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"        
     [,51]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
     [,52]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"         
     [,53]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"            
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
     [,54]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"          
     [,55]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
     [,56]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"  
     [,57]                                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"
     [,58]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
     [,59]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"    
     [,60]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"
     [,61]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
     [,62]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
     [,63]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
     [,64]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"  
     [,65]                                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"       
     [,66]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
     [,67]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
     [,68]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
     [,69]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
     [,70]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
     [,71]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"    
     [,72]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
     [,73]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"                               
     [,74]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
     [,75]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"       
     [,76]                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv"
     [,77]                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"
     [,78]                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
     [,79]                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"    
     [,80]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"           
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
     [,81]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"      
     [,82]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"      
     [,83]                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"             
     [,84]                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
     [,85]                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"             
     [,86]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"                  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
     [,87]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"             
     [,88]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
     [,89]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"             
     [,90]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"                    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
     [,91]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"             
     [,92]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"           
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
     [,93]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"             
     [,94]                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"
     [,95]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"
     [,96]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"                    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
     [,97]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"       
     [,98]                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"
     [,99]                                                                                            
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
     [,100]                                                                                           
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
     [,101]                                                                                           
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"
     [,102]                                                                                            
[1,] "group_combinedquestionnaires.group_databrary.acknowledge_databrary"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.acknowledge_databrary"
     [,103]                                                                                       
[1,] "group_combinedquestionnaires.group_databrary.note_saveasdraft"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_saveasdraft"

Visual inspection suggests that these are similar with the following deviations:

  • As noted, df_merge_287_cols_1 has a column ending doctor_told_you that is not present in df_merge_287_cols_2.
  • df_merge_287_cols_2 has a column ending technology_use_scale that is not present in df_merge_287_cols_1.
  • There is a set of fields in group_databrary that do not align exactly. We will almost certainly delete these, so the misalignment is not a huge problem.

As an exploration, let’s see if we can reconcile these by deleting the non-aligning columns.

Code
# Start from the two 287-column merges and drop the one column that each has
# and the other lacks.
df1 <- dplyr::select(df_merge_287_cols_1, -contains("doctor_told_you"))
df2 <- dplyr::select(df_merge_287_cols_2, -contains("technology_use_scale"))

# Strip the `group_locomotor_milestones.` label from the df2 names in
# positions 92:273; every other name is left as is.
# NOTE(review): the 92:273 range is hard-coded for this particular export.
old_names <- names(df2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df2) <- new_names

# Element-wise comparison of the two name vectors.
names(df1) == names(df2)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
[277] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

This looks promising.

Code
# Show the name at position 263 from each data frame (row 1: df1, row 2: df2).
rbind(names(df1)[263], names(df2)[263])
     [,1]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"

This is easily fixed.

Code
# Overwrite df1's name at position 263 with df2's spelling of it.
names(df1)[263] <- names(df2)[263]
Code
# Show the names at positions 273:275 from each data frame (row 1: df1, row 2: df2).
rbind(names(df1)[273:275], names(df2)[273:275])
     [,1]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
     [,2]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
     [,3]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"

The last misalignments relate to Databrary fields.

Code
# Drop every column whose name contains "group_databrary" from both frames,
# then compare the remaining column names position by position.
df1 <- dplyr::select(df1, -contains("group_databrary"))
df2 <- dplyr::select(df2, -contains("group_databrary"))

names(df1) == names(df2)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[256] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[271] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Success!

Combining the two groups of datasets

Now, let’s go back to the data frame with 288 cols and see if we can bring these into alignment.

Code
# Start df3 from the 288-column merge with the "group_databrary" columns removed.
df3 <- dplyr::select(df_merge_288_cols, -contains("group_databrary"))

# Rows and columns of df1, df2 and df3, flattened into one vector.
unlist(lapply(list(df1, df2, df3), dim))
[1] 258 281  61 281 539 283
Code
# NOTE(review): df1 has 281 columns and df3 has 283 here, so `==` recycles the
# shorter vector and warns; read the result only up to the first FALSE.
names(df1) == names(df3)
Warning in names(df1) == names(df3): longer object length is not a multiple of
shorter object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Code
# Show the names at positions 114:115 from each data frame (row 1: df1, row 2: df3).
rbind(names(df1)[114:115], names(df3)[114:115])
     [,1]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
     [,2]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    

Once again, there appears to be a problem with the ‘doctor_told_you’ field. We’ll delete it to see if this fixes one of the problems.

Code
# Remove the column(s) whose name contains "doctor_told_you" from df3, then
# compare the column names against df1 again.
df3 <- dplyr::select(df3, -contains("doctor_told_you"))

names(df1) == names(df3)
Warning in names(df1) == names(df3): longer object length is not a multiple of
shorter object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE FALSE FALSE FALSE FALSE

We still have misalignments starting at column 210.

Code
# Show the names at positions 210:213 from each data frame (row 1: df1, row 2: df3).
rbind(names(df1)[210:213], names(df3)[210:213])
     [,1]                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"               
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"
     [,2]                                                                                              
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"   
     [,3]                                                                                              
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
     [,4]                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.transportation"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime"       

The `technology_use_scale` column exists in df3 but not in df1.

Code
# Remove the column(s) whose name contains "technology_use_scale" from df3,
# then show the dimensions of df1 (row 1) and df3 (row 2).
df3 <- dplyr::select(df3, -contains("technology_use_scale"))

rbind(dim(df1), dim(df3))
     [,1] [,2]
[1,]  258  281
[2,]  539  281
Code
# Element-wise comparison of the df1 and df3 column names.
names(df1) == names(df3)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[277]  TRUE  TRUE  TRUE  TRUE  TRUE

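The single remaining mismatch is at column 263, presumably the same typicalday/typical_day naming difference seen earlier. Future versions of the workflow will need to handle these discrepancies more elegantly.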

Option 1: Fix the underlying forms.

Option 2: Add the ‘missing’ columns as NA in post-processing.

For now, I’m going to create functions that align these data frames. These are incorporated into R/utils.R so we do not source them again here.

Code
# Drop every column whose name contains "technology_use_scale".
#
# df: a data frame.
# Returns `df` without the matching columns; remaining columns keep their order.
remove_technology_use_scale <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("technology_use_scale"))
}

# Drop every column whose name contains "doctor_told_you".
#
# df: a data frame.
# Returns `df` without the matching columns; remaining columns keep their order.
remove_doctor_told_you <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("doctor_told_you"))
}

# Drop every column whose name contains "group_databrary".
#
# df: a data frame.
# Returns `df` without the matching columns; remaining columns keep their order.
remove_databrary_fields <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("group_databrary"))
}

# Rewrite every occurrence of "typicalday" in the column names as
# "typical_day" so both spellings end up identical.
#
# df: a data frame.
# Returns `df` with renamed columns; the data itself is unchanged.
reconcile_typicalday <- function(df) {
  stats::setNames(
    df,
    stringr::str_replace_all(names(df), "typicalday", "typical_day")
  )
}

# Remove the first "group_locomotor_milestones." label from the column names
# selected by the detection pattern below; all other names are left untouched.
#
# df: a data frame.
# Returns `df` with the adjusted column names.
#
# NOTE(review): the `|` alternation in the detection pattern is top-level, so
# `locomotor_milestones.*` constrains only `health`; the other keywords
# (division, rothbart, ...) select a name wherever they appear. Selected names
# that lack the label pass through `str_remove()` unchanged. Confirm whether
# `locomotor_milestones.*(health|division|...)` was intended.
remove_permissive_locomotor_milestones_label <- function(df) {
  col_names <- names(df)
  needs_strip <- stringr::str_detect(
    col_names,
    pattern = "locomotor_milestones.*health|division|rothbart|mediause|pets|typical|acknowledge"
  )
  col_names[needs_strip] <- stringr::str_remove(
    col_names[needs_strip],
    "group_locomotor_milestones\\."
  )
  names(df) <- col_names
  df
}

# Drop every column whose name contains "X_" or "meta.instanceID".
#
# df: a data frame.
# Returns `df` without the matching columns; remaining columns keep their order.
#
# NOTE(review): contains() matches literally but ignores case by default, so
# "X_" also matches names containing "x_". Confirm no wanted column is dropped.
remove_X_meta_cols <- function(df) {
  unwanted <- c("X_", "meta.instanceID")
  dplyr::select(df, -dplyr::contains(unwanted))
}

# Strip the "group_homevisitquestionnaires." and
# "group_combinedquestionnaires." labels from every column name, in that order.
#
# df: a data frame.
# Returns `df` with the shortened column names.
remove_redundant_group_labels <- function(df) {
  trimmed <- names(df)
  labels <- c(
    'group_homevisitquestionnaires\\.',
    'group_combinedquestionnaires\\.'
  )
  for (label in labels) {
    trimmed <- stringr::str_remove_all(trimmed, label)
  }
  names(df) <- trimmed
  df
}

# Run the full column-cleaning sequence on one data frame.
#
# df: a data frame.
# Returns the data frame after every step below has been applied in order;
# each step takes a data frame and returns a data frame.
clean_dfs <- function(df) {
  steps <- list(
    reconcile_typicalday,
    remove_technology_use_scale,
    remove_doctor_told_you,
    remove_permissive_locomotor_milestones_label,
    remove_databrary_fields,
    remove_X_meta_cols,
    remove_redundant_group_labels
  )
  # Feed the output of each step into the next one.
  Reduce(function(current, step) step(current), steps, df)
}

Let’s test this workflow with the unmodified files.

Code
# Run clean_dfs() on the unmodified df_merge_287_cols_1 and report its dimensions.
df1m <- clean_dfs(df_merge_287_cols_1)
dim(df1m)
[1] 258 272
Code
# Run clean_dfs() on the unmodified df_merge_287_cols_2 and report its dimensions.
df2m <- clean_dfs(df_merge_287_cols_2)
dim(df2m)
[1]  61 272
Code
# Run clean_dfs() on the unmodified df_merge_288_cols and report its dimensions.
df3m <- clean_dfs(df_merge_288_cols)
dim(df3m)
[1] 539 272
Code
# Count the positions at which the df1m and df2m column names agree.
(names(df1m) == names(df2m)) |> sum()
[1] 272
Code
# Count the positions at which the df1m and df3m column names agree.
(names(df1m) == names(df3m)) |> sum()
[1] 272

Merging and exporting

Code
# Stack the three cleaned data frames row-wise into a single data frame.
df <- rbind(df1m, df2m, df3m)

Save exported aggregate file.

Code
# Write the combined data frame to the aggregate CSV under data/csv/home_visit/agg.
# NOTE(review): the file name says "non-mcdi" while the directories use "non_mbcdi" — confirm the spelling is intended.
readr::write_csv(df, file = file.path(here::here(), "data/csv/home_visit/agg", "PLAY-non-mcdi-raw-latest.csv"))