Aggregate

Modified

September 17, 2024

About

This page documents the process for combining separate files into aggregate files containing data from many participants.

Setup

# Collect the scripts in the project's R/ directory whose file names match one
# of the listed prefixes or keywords, returning full paths.
fl <-
  list.files(
    file.path(here::here(), "R"),
    "^load|^kobo_|^file_|^screen_|^ecbq_|^health_|^databrary|^home|^make|^export|^post_visit|CONSTANTS|utils",
    full.names = TRUE
  )
# walk() is used for its side effect only: each path is passed to source().
purrr::walk(fl, source)

suppressPackageStartupMessages(library(tidyverse))

Remove identifiers

The non-MBCDI file contains the identifiers, so that is the target of this removal process.

Note that we have added data to .gitignore in protocol/, the root directory for the HTML protocol, so none of the data files should be made available via git or GitHub. This also means that there is no version control being done on raw data files themselves.

# Target definition for the identifier-removal step (presumably for a
# {targets} pipeline — TODO confirm).
# NOTE(review): this names `open_deidentify_save` and a relative
# `csv_save_dir`, whereas the chunk that follows calls
# `file_open_deidentify_save` with a directory built from here::here().
# Confirm which form is current.
tar_target(
  home_visit_remove_identifiers,
  purrr::map_chr(
    home_visit_non_mbcdi,
    open_deidentify_save,
    csv_save_dir = "data/csv/home_visit/non_mbcdi/deid",
      these_questions = 'non_mbcdi'
  )
)
# Full paths of every raw non-MB-CDI CSV.
home_visit_non_mbcdi <-
  list.files(
    path = file.path(here::here(), "data/csv/home_visit/non_mbcdi/raw"),
    pattern = "\\.csv$",
    full.names = TRUE
  )

# Pass each raw file to file_open_deidentify_save(); map_chr() collects the
# character value returned for each file (printed below).
purrr::map_chr(
  home_visit_non_mbcdi,
  file_open_deidentify_save,
  csv_save_dir = file.path(
    here::here(),
    "data/csv/home_visit/non_mbcdi/deid"
  ),
  these_questions = "non_mbcdi"
)
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1136694_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1151489_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/307736_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331453_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331848_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/334099_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363349_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363381_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363431_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363465_non_mbcdi_24_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363466_non_mbcdi_18_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/408149_non_mbcdi_24_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411388_non_mbcdi_18_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411456_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411469_non_mbcdi_12_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv`
Saved `/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv`
 [1] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1136694_non_mbcdi_18_english_deidentified.csv"         
 [2] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/1151489_non_mbcdi_18_english_deidentified.csv"         
 [3] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/307736_non_mbcdi_18_english_deidentified.csv"          
 [4] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331453_non_mbcdi_24_english_deidentified.csv"          
 [5] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/331848_non_mbcdi_12_english_deidentified.csv"          
 [6] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/334099_non_mbcdi_12_bilingual_english_deidentified.csv"
 [7] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363349_non_mbcdi_18_english_deidentified.csv"          
 [8] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363381_non_mbcdi_24_english_deidentified.csv"          
 [9] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363431_non_mbcdi_12_english_deidentified.csv"          
[10] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363465_non_mbcdi_24_bilingual_english_deidentified.csv"
[11] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/363466_non_mbcdi_18_bilingual_english_deidentified.csv"
[12] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/408149_non_mbcdi_24_bilingual_spanish_deidentified.csv"
[13] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411388_non_mbcdi_18_bilingual_spanish_deidentified.csv"
[14] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411456_non_mbcdi_12_bilingual_english_deidentified.csv"
[15] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/411469_non_mbcdi_12_bilingual_spanish_deidentified.csv"
[16] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv"
[17] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv"
[18] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv"          
[19] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv"
[20] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv"
[21] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv"          
[22] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv"          
[23] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv"
[24] "/Users/rog1/rrr/KoBoToolbox/data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv"

Quality assurance (QA) reviews

MB-CDI files

Note

To be completed.

Non-MB-CDI files

Create a helper function to create a data set with summary information about the data files.

# Summarize one CSV file by its dimensions.
#
# @param fn Character path to a CSV file.
# @return A one-row tibble with `file_name` (the base name of `fn`),
#   `n_rows`, and `n_vars` (row and column counts of the parsed file).
# Signals an error if `fn` is not character or the file does not exist.
summarize_non_mbcdi_qs <- function(fn) {
  stopifnot(is.character(fn))

  # Guard clause: fail early with the path that could not be found.
  if (!file.exists(fn)) {
    stop('File not found `', fn, '`')
  }

  # readr::read_csv() either returns a tibble or signals its own error, so no
  # separate is.data.frame() check is needed afterwards.
  df <- readr::read_csv(fn, show_col_types = FALSE)

  # The summary has exactly one row, so there is nothing to sort.
  tibble::tibble(
    file_name = basename(fn),
    n_rows = nrow(df),
    n_vars = ncol(df)
  )
}

Select the de-identified CSVs to examine.

# Full paths of the de-identified CSVs whose names carry 12, 18, or 24 right
# after `non_mbcdi_`.
# The alternation must be a group: a bracketed `[12|18|24]` is a character
# class that matches any single character among 1, 2, 4, 8, and `|`, so it
# would not restrict that field to the three intended values.
fl <-
  list.files(
    file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid"),
    "^[0-9]+_non_mbcdi_(12|18|24)_.*deidentified",
    full.names = TRUE
  )

# One summary row per file, row-bound into a single data frame.
PLAY_forms <- purrr::map_df(fl, summarize_non_mbcdi_qs)

# Render the summary as an HTML table styled with kable_classic().
kableExtra::kable_classic(
  knitr::kable(PLAY_forms, format = 'html')
)
file_name n_rows n_vars
1136694_non_mbcdi_18_english_deidentified.csv 0 288
1151489_non_mbcdi_18_english_deidentified.csv 0 286
307736_non_mbcdi_18_english_deidentified.csv 4 274
331453_non_mbcdi_24_english_deidentified.csv 3 274
331848_non_mbcdi_12_english_deidentified.csv 4 267
334099_non_mbcdi_12_bilingual_english_deidentified.csv 1 267
363349_non_mbcdi_18_english_deidentified.csv 9 280
363381_non_mbcdi_24_english_deidentified.csv 8 280
363431_non_mbcdi_12_english_deidentified.csv 10 281
363465_non_mbcdi_24_bilingual_english_deidentified.csv 0 280
363466_non_mbcdi_18_bilingual_english_deidentified.csv 0 280
408149_non_mbcdi_24_bilingual_spanish_deidentified.csv 1 280
411388_non_mbcdi_18_bilingual_spanish_deidentified.csv 0 280
411456_non_mbcdi_12_bilingual_english_deidentified.csv 1 280
411469_non_mbcdi_12_bilingual_spanish_deidentified.csv 1 280
740623_non_mbcdi_12_bilingual_english_deidentified.csv 46 288
740624_non_mbcdi_12_bilingual_spanish_deidentified.csv 4 288
740625_non_mbcdi_12_english_deidentified.csv 238 288
740626_non_mbcdi_18_bilingual_english_deidentified.csv 61 287
740627_non_mbcdi_18_bilingual_spanish_deidentified.csv 7 287
740628_non_mbcdi_18_english_deidentified.csv 223 288
740629_non_mbcdi_24_english_deidentified.csv 181 287
740630_non_mbcdi_24_bilingual_spanish_deidentified.csv 4 287
740631_non_mbcdi_24_bilingual_english_deidentified.csv 47 287

The later forms (with higher form numbers—the leading integers in the file names) are the newer ones. These generally have the largest number of entries and have similar numbers of columns—either 287 or 288. Accordingly, we focus our cleaning efforts here first.

We start with the data files that have \(n=288\) columns.

df740623 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

df740624 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740623) == names(df740624))
[1] 288
df740625 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740623) == names(df740625))
[1] 288
df740628 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740623) == names(df740628))
[1] 288

So, four of the most recent data files with \(n=288\) columns can be aggregated without modification.

Let’s turn to the more recent files with \(n=287\) columns.

df740626 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

df740627 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740626) == names(df740627))
[1] 100

Where does the misalignment arise?

names(df740626) == names(df740627)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

The misalignment begins at column 92 and continues through column 278; only the first 91 and the last 9 column names match.

df740629 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740626) == names(df740629))
[1] 287

So, df740626 and df740629 are aligned and can be merged.

df740630 <-
  readr::read_csv(
    file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv"),
    show_col_types = FALSE
  )

sum(names(df740626) == names(df740630))
[1] 100
names(df740626) == names(df740630)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

These files also fall out of alignment near column 92.

df740631 <-
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv"
    ),
    show_col_types = FALSE
  )

sum(names(df740626) == names(df740631))
[1] 100
names(df740626) == names(df740631)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

And these files fall out of alignment near column 92.

Let’s see if df740627, df740630, and df740631 are aligned with one another.

sum(names(df740627) == names(df740630))
[1] 287
sum(names(df740627) == names(df740631))
[1] 287

Yes, they are. So, these three can be merged. We do that first, then address the discrepancies between aggregates.

‘Older’ forms

The “older” forms have varied numbers of columns. We focus on those with data (n_rows > 0).

# Read one CSV from the de-identified non-MB-CDI directory.
#
# @param file_name File name (not a full path) of the CSV to read.
# @return The tibble returned by readr::read_csv().
read_deid_csv <- function(file_name) {
  readr::read_csv(
    file.path(
      here::here(),
      "data/csv/home_visit/non_mbcdi/deid",
      file_name
    ),
    show_col_types = FALSE
  )
}

# One data frame per older form, named after the form's leading number.
df307736 <-
  read_deid_csv("307736_non_mbcdi_18_english_deidentified.csv")

df331453 <-
  read_deid_csv("331453_non_mbcdi_24_english_deidentified.csv")

df331848 <-
  read_deid_csv("331848_non_mbcdi_12_english_deidentified.csv")

df334099 <-
  read_deid_csv("334099_non_mbcdi_12_bilingual_english_deidentified.csv")

df363349 <-
  read_deid_csv("363349_non_mbcdi_18_english_deidentified.csv")

df363381 <-
  read_deid_csv("363381_non_mbcdi_24_english_deidentified.csv")

df363431 <-
  read_deid_csv("363431_non_mbcdi_12_english_deidentified.csv")

df408149 <-
  read_deid_csv("408149_non_mbcdi_24_bilingual_spanish_deidentified.csv")

df411456 <-
  read_deid_csv("411456_non_mbcdi_12_bilingual_english_deidentified.csv")

df411469 <-
  read_deid_csv("411469_non_mbcdi_12_bilingual_spanish_deidentified.csv")

Let’s look at the two forms that have the same number of columns, \(n=274\), 307736 and 331453.

names(df307736) == names(df331453)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[256] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[271] TRUE TRUE TRUE TRUE
# identical() is TRUE only when both name vectors match element for element;
# comparing lengths alone would not show whether the names agree.
identical(names(df307736), names(df331453))
[1] TRUE

So, these two have identical column names and could be merged.

# Full paths of every de-identified non-MB-CDI CSV.
hv_deid_fl <-
  list.files(
    path = file.path(here::here(), "data/csv/home_visit/non_mbcdi/deid"),
    pattern = "\\.csv$",
    full.names = TRUE
  )

# Logical mask: TRUE where the path contains "/307736" or "/331453".
files_274_cols <- stringr::str_detect(hv_deid_fl, "/(307736|331453)")

# Pass only the selected files to the aggregation helper.
df_merge_274_cols <-
  file_make_aggregate_from_csvs(hv_deid_fl[files_274_cols])

How about the files with \(n=267\) columns, 331848 and 334099?

names(df331848) == names(df334099)
  [1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[265]  TRUE  TRUE  TRUE
# NOTE(review): this checks lengths only (the comparison vector against the
# number of names in df331848); it does not test whether the names match.
# The element-wise comparison printed above is the informative result.
length(names(df331848) == names(df334099)) == length(names(df331848))
[1] TRUE
names(df331848) |> head()
[1] "group_combinedquestionnaires/participant_id"             
[2] "start"                                                   
[3] "end"                                                     
[4] "group_combinedquestionnaires/note_fillthisoutbeforestudy"
[5] "group_combinedquestionnaires/site_id"                    
[6] "group_combinedquestionnaires/subject_number"             
names(df334099) |> head()
[1] "group_jo84c13/participant_id"             
[2] "start"                                    
[3] "end"                                      
[4] "group_jo84c13/note_fillthisoutbeforestudy"
[5] "group_jo84c13/site_id"                    
[6] "group_jo84c13/subject_number"             

There is an odd difference in the group label, group_combinedquestionnaires vs. group_jo84c13.

Let’s try deleting the initial group labels and compare again.

n1 <- names(df331848)
n2 <- names(df334099)

names(df331848) %>% stringr::str_remove("group_combinedquestionnaires/") |> head()
[1] "participant_id"              "start"                      
[3] "end"                         "note_fillthisoutbeforestudy"
[5] "site_id"                     "subject_number"             
names(df334099) %>% stringr::str_remove("group_jo84c13/") |> head()
[1] "participant_id"              "start"                      
[3] "end"                         "note_fillthisoutbeforestudy"
[5] "site_id"                     "subject_number"             

That looks promising.

# Remove the first occurrence of each file's group label from its column
# names, then compare the two name vectors position by position.
n1 <- stringr::str_remove(names(df331848), "group_combinedquestionnaires/")
n2 <- stringr::str_remove(names(df334099), "group_jo84c13/")
n1 == n2
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
 [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[265]  TRUE  TRUE  TRUE
cbind(n1[8:15], n2[8:15])
     [,1]                     [,2]                    
[1,] "test_date"              "test_date"             
[2,] "child_sex"              "child_birth_date"      
[3,] "age_group"              "child_sex"             
[4,] "language_child"         "age_group"             
[5,] "language_child/english" "language_child"        
[6,] "language_child/spanish" "language_child/english"
[7,] "language_instruction"   "language_child/spanish"
[8,] "acknowledge_site"       "language_instruction"  

n2 (the names from df334099) has a child_birth_date field in position 9 that the other data frame does not have.

n1 |> str_detect("child_birth_date") |> sum()
[1] 0

If we delete that variable, the data frames will no longer have the same number of columns. Let’s explore that anyway.

n2_2 <- n2[-9]

n1 == n2_2
Warning in n1 == n2_2: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE
 [37]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [49] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE

That helps a bit, but we diverge around column 29.

cbind(n1[28:51], n2_2[28:51])
      [,1]                                                                                                
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"              
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/birthhospital"
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/afterhome"    
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"           
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"      
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/donotknow"    
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"               
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/birthhospital" 
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/afterhome"     
[10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"            
[11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"       
[12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
[13,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
[14,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
[15,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
[16,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
[17,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
[18,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
[19,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
[20,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
[21,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
[22,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
[23,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
[24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
      [,2]                                                                                                                                  
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"                                                
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__in_the_bi"                                 
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__after_goi"                                 
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"                                             
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"                                        
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/don_t_know"                                     
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"                                                 
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__in_the_bi"                                  
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__after_goi"                                  
[10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"                                              
[11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"                                         
[12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/don_t_know"                                      
[13,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/child_allergies_infections_ill_header"
[14,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"                            
[15,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"                        
[16,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"                               
[17,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"                          
[18,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"                     
[19,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                                                
[20,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                                                  
[21,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                                      
[22,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                                             
[23,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                                     
[24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                                                         

These question labels look very similar; there are just some minor changes in the variable names. In addition, n2_2 has an extra variable in column 40 (child_allergies_infections_ill_header).

# Drop element 40 of n2_2, the extra name that has no counterpart in n1.
n2_3 <- n2_2[-40]

Then, we can rename some of the columns in n2_3 using corresponding names from n1.

# Map the answer-option suffixes used in n2_3 onto the spellings used in n1
# so the two name vectors can be compared element by element.
# Each call replaces the first match of its pattern within each name.
n2_4 <- n2_3 |>
  stringr::str_replace("yes__in_the_bi", "birthhospital") |>
  stringr::str_replace("yes__after_goi", "afterhome") |>
  stringr::str_replace("don_t_know", "donotknow")

# Element-wise comparison of the two name vectors.
# NOTE(review): the vectors differ in length, so R recycles the shorter one
# (hence the warning); results past the shorter length are not
# position-matched and should not be interpreted.
n1 == n2_4
Warning in n1 == n2_4: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[229]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE
# Show elements 39-51 of n1 and n2_4 side by side around the first mismatch.
cbind(n1[39:51], n2_4[39:51])
      [,1]                                                                                                
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
[10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
[11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
[12,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
[13,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
      [,2]                                                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"                  
 [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"       
 [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"   
 [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"          
 [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"     
 [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"
 [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                           
 [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                             
 [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                 
[10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                        
[11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                
[12,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                                    
[13,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"                                      

n1 has a group_medicalprof label from allergies through gastrointestinal; n2_4 has child_allergies_infections_ill for the same questions.

# n2_4 labels this question group child_allergies_infections_ill where n1
# uses group_medicalprof; adopt the n1 label, then compare again.
n2_5 <- n2_4 |>
  stringr::str_replace("child_allergies_infections_ill", "group_medicalprof")
n1 == n2_5
Warning in n1 == n2_5: longer object length is not a multiple of shorter object
length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
[229]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE
# Show elements 49-60 of n1 and n2_5 side by side around the next mismatch.
cbind(n1[49:60], n2_5[49:60])
      [,1]                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
 [2,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"        
 [3,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
 [4,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
 [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
 [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
 [7,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
 [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
 [9,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
[10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
[11,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
[12,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
      [,2]                                                                             
 [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
 [2,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
 [3,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
 [4,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
 [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
 [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
 [7,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
 [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
 [9,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
[10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
[11,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
[12,] "group_homevisitquestionnaires/group_health/group_drinking/pregnant_drinking"    

It looks like these could be reconciled by deleting prenatal_care from n1.

# Drop element 50 of n1 (the prenatal_care name), which n2_5 lacks,
# then compare the two name vectors again.
n1_2 <- n1[-50]
n1_2 == n2_5
Warning in n1_2 == n2_5: longer object length is not a multiple of shorter
object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE
# Show elements 64-75 of n1_2 and n2_5 side by side around the next mismatch.
cbind(n1_2[64:75], n2_5[64:75])
      [,1]                                                                                                          
 [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                                 
 [2,] "group_homevisitquestionnaires/group_health/group_phq4/note_phq4"                                             
 [3,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_nervous"                           
 [4,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_worrying"                          
 [5,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_littleinterest"                    
 [6,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_down"                              
 [7,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                         
 [8,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions1"              
 [9,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions2"              
[10,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_unfamiliarperson"
[11,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_troubletask"     
[12,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_companyofchild"  
      [,2]                                                                                                       
 [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                              
 [2,] "group_homevisitquestionnaires/group_health/group_phq4/Experimenter_These_stions_are_about_you"            
 [3,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                      
 [4,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructor_rothbart"                           
 [5,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructions_rothbart2"                        
 [6,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/rothbart_questions_header"                
 [7,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_approached_by_a_ld_cling_to_a_parent"
 [8,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_having_trouble_get_easily_irritated"
 [9,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_a_familiar_chil_company_of_the_child"
[10,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_offered_a_choic_uickly_and_go_for_it"
[11,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/During_daily_or_even_eing_quietly_sung_to"
[12,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_playing_outdoo_and_excitement_of_it"

It looks like the individual PHQ-4 items (phq4_nervous, phq4_worrying, phq4_littleinterest, phq4_down) are not in n2_5.

Let’s check.

# Count how many names in n2_5 contain "phq4".
sum(stringr::str_detect(n2_5, "phq4"))
[1] 2

Yes, there are only two PHQ4-related questions in df334099.

# Count how many column names of df334099 contain "phq4".
sum(stringr::str_detect(names(df334099), "phq4"))
[1] 2

This path of reconciliation does not appear fruitful.

Make aggregate files

non-MB-CDI files with \(n=288\) columns

# Logical mask over hv_deid_fl: de-identified non-MB-CDI CSVs whose name
# contains 23, 24, 25 or 28 immediately before "_non_mbcdi".
files_288_cols <- stringr::str_detect(
  hv_deid_fl,
  "2[3458]_non_mbcdi.*_deidentified\\.csv"
)

# Combine the selected files with file_make_aggregate_from_csvs()
# (helper defined outside this section).
df_merge_288_cols <- hv_deid_fl[files_288_cols] |>
  file_make_aggregate_from_csvs()

non-MB-CDI files with \(n=287\) columns

# First group: file names containing 26 or 29 immediately before
# "_non_mbcdi".
files_287_cols_1 <- stringr::str_detect(
  hv_deid_fl,
  "2[69]_non_mbcdi.*_deidentified\\.csv"
)

# Second group: file names containing 740627, 740630 or 740631 followed
# by "_non".
files_287_cols_2 <- stringr::str_detect(
  hv_deid_fl,
  "(740627|740630|740631)_non.*_deidentified\\.csv"
)

# Combine each group separately with file_make_aggregate_from_csvs()
# (helper defined outside this section).
df_merge_287_cols_1 <- hv_deid_fl[files_287_cols_1] |>
  file_make_aggregate_from_csvs()

df_merge_287_cols_2 <- hv_deid_fl[files_287_cols_2] |>
  file_make_aggregate_from_csvs()

Examine groups with \(n=287\) cols

We focus on the starting column where the column names diverge, column 92.

# Disabled alternative: load both aggregates from a targets store instead of
# using the objects built above (presumably a cached pipeline; verify).
# targets::tar_load(df_merge_287_cols_1, store="../_targets")
# targets::tar_load(df_merge_287_cols_2, store="../_targets")
# Name of column 92 in the first 287-column aggregate.
names(df_merge_287_cols_1)[92]
[1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_feeding_nutrition.instructions_feeding"
# Name of column 92 in the second 287-column aggregate, for comparison.
names(df_merge_287_cols_2)[92]
[1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_locomotor_milestones.group_health.group_feeding_nutrition.instructions_feeding"

The df_merge_287_cols_2 column name contains an erroneous "group_locomotor_milestones." segment (including its trailing dot).

A bit of sleuthing shows that this "group_locomotor_milestones." segment appears in every column name from 92 to 273.

# Check which of column names 92-273 contain "group_locomotor_milestones".
stringr::str_detect(
  names(df_merge_287_cols_2)[92:273],
  pattern = "group_locomotor_milestones"
)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE

The following should fix this.

# Keep the original names, then build a corrected copy in which the
# "group_locomotor_milestones." segment is removed from columns 92-273 only;
# names outside that range are left untouched.
old_names <- names(df_merge_287_cols_2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df_merge_287_cols_2) <- new_names

# Both data frames have the same number of columns, so this comparison is
# position by position.
names(df_merge_287_cols_1) == names(df_merge_287_cols_2)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
[277] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

We have a second problem with columns 114 to 210; column 264 and columns 274 to 278 also differ.

# Stack names 113-115 of both aggregates (row 1: cols_1, row 2: cols_2)
# to inspect the point where they start to differ.
rbind(
  names(df_merge_287_cols_1)[113:115],
  names(df_merge_287_cols_2)[113:115]
)
     [,1]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
     [,2]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
     [,3]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"

One of the problems has to do with column 114. There is a question ending doctor_told_you in names(df_merge_287_cols_1) but not in names(df_merge_287_cols_2). As a result, the names that follow are offset by one position between the two data frames.

# Count column names in the first aggregate that contain "doctor_told_you".
sum(
  stringr::str_detect(names(df_merge_287_cols_1), pattern = "doctor_told_you")
)
[1] 1
# Count column names in the second aggregate that contain "doctor_told_you".
sum(
  stringr::str_detect(names(df_merge_287_cols_2), pattern = "doctor_told_you")
)
[1] 0

Deleting this question would create additional misalignments and further problems. We cannot proceed without further discussion with our team.

For now, let’s generate an array with all of the remaining differences in column names.

# Logical mask of column positions whose names still differ between the
# two 287-column aggregates, followed by the number of such positions.
names_differ <- names(df_merge_287_cols_1) != names(df_merge_287_cols_2)
sum(names_differ)
[1] 103
# List every differing name pair (row 1: cols_1, row 2: cols_2).
rbind(
  names(df_merge_287_cols_1)[names_differ],
  names(df_merge_287_cols_2)[names_differ]
)
     [,1]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
     [,2]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
     [,3]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"       
     [,4]                                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"
     [,5]                                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
     [,6]                                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"                           
     [,7]                                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"  
     [,8]                                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"    
     [,9]                                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
     [,10]                                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"        
     [,11]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"        
     [,12]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
     [,13]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"  
     [,14]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
     [,15]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
     [,16]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
     [,17]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"    
     [,18]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
     [,19]                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"         
     [,20]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"  
     [,21]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"
     [,22]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"
     [,23]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
     [,24]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
     [,25]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
     [,26]                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"   
     [,27]                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"            
     [,28]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"        
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
     [,29]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous"     
     [,30]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"
     [,31]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
     [,32]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"          
     [,33]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
     [,34]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
     [,35]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"
     [,36]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"                        
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"
     [,37]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
     [,38]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"     
     [,39]                                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"
     [,40]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
     [,41]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"     
     [,42]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
     [,43]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"     
     [,44]                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks"
     [,45]                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
     [,46]                                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"        
     [,47]                                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"
     [,48]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"
     [,49]                                                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"       
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
     [,50]                                                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"        
     [,51]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
     [,52]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"         
     [,53]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"            
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
     [,54]                                                                                                                        
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"          
     [,55]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
     [,56]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"  
     [,57]                                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"
     [,58]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
     [,59]                                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"    
     [,60]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"
     [,61]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
     [,62]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
     [,63]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
     [,64]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"  
     [,65]                                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"       
     [,66]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
     [,67]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
     [,68]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
     [,69]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
     [,70]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
     [,71]                                                                                                                     
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"    
     [,72]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
     [,73]                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"                               
     [,74]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
     [,75]                                                                                             
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"       
     [,76]                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv"
     [,77]                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"
     [,78]                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
     [,79]                                                                                               
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"    
     [,80]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"           
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
     [,81]                                                                                                      
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"      
     [,82]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"      
     [,83]                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"             
     [,84]                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
     [,85]                                                                                           
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"             
     [,86]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"                  
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
     [,87]                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"             
     [,88]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"         
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
     [,89]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"             
     [,90]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"                    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
     [,91]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"             
     [,92]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"           
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
     [,93]                                                                                                 
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"             
     [,94]                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"   
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"
     [,95]                                                                                            
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"     
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"
     [,96]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"                    
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
     [,97]                                                                                                                
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"       
     [,98]                                                                                                  
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"
     [,99]                                                                                            
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
     [,100]                                                                                           
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
     [,101]                                                                                           
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"
     [,102]                                                                                            
[1,] "group_combinedquestionnaires.group_databrary.acknowledge_databrary"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.acknowledge_databrary"
     [,103]                                                                                       
[1,] "group_combinedquestionnaires.group_databrary.note_saveasdraft"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_saveasdraft"

Visual inspection suggests that these are similar with the following deviations:

  • As noted, df_merge_287_cols_1 has a column ending doctor_told_you that is not present in df_merge_287_cols_2.
  • df_merge_287_cols_2 has a column ending technology_use_scale that is not present in df_merge_287_cols_1.
  • There is a set of fields in group_databrary that do not align exactly. We will almost certainly delete these, so the misalignment is not a huge problem.

As an exploration, let’s see if we can reconcile these by deleting the non-aligning columns.

# Work on copies, dropping from each data frame any column whose name
# contains the fragment that the other data frame lacks.
df1 <- dplyr::select(df_merge_287_cols_1, -contains("doctor_told_you"))
df2 <- dplyr::select(df_merge_287_cols_2, -contains("technology_use_scale"))

# Remove the "group_locomotor_milestones." path segment from the names in
# positions 92 through 273 of df2; names outside that range are untouched.
# NOTE(review): the 92:273 range is hard-coded and presumably matches the
# current column layout of df2 -- confirm if the input files change.
old_names <- names(df2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df2) <- new_names

# Element-wise comparison of the two name vectors; FALSE marks a position
# where the column names still differ.
names(df1) == names(df2)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
[277] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

This looks promising.

# Show the name at position 263 from each data frame (df1 in row 1, df2 in
# row 2); this is the first position flagged FALSE above.
rbind(names(df1)[263], names(df2)[263])
     [,1]                                                                                                   
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"

This is easily fixed.

# Adopt df2's spelling for column 263 so the two names agree.
names(df1)[263] <- names(df2)[263]
# Show the names at positions 273:275 (df1 in row 1, df2 in row 2).
rbind(names(df1)[273:275], names(df2)[273:275])
     [,1]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
     [,2]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
     [,3]                                                                                             
[1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"

The last misalignments relate to Databrary fields.

# Drop every column whose name contains "group_databrary" from both data
# frames, then re-check the remaining names position by position.
df1 <- dplyr::select(df1, -contains('group_databrary'))
df2 <- dplyr::select(df2, -contains('group_databrary'))

names(df1) == names(df2)
  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[151] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[196] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[226] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[256] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[271] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Success!

Combining the two groups of datasets

Now, let’s go back to the data frame with 288 cols and see if we can bring these into alignment.

# Start from the 288-column data frame and drop its Databrary columns.
df3 <- dplyr::select(df_merge_288_cols, -contains('group_databrary'))

# Rows and columns of df1, df2 and df3, in that order.
c(dim(df1), dim(df2), dim(df3))
[1] 242 281  58 281 511 283
# df1 and df3 have different numbers of columns at this point (281 vs. 283
# per the dim() output), so `==` recycles the shorter vector and warns.
names(df1) == names(df3)
Warning in names(df1) == names(df3): longer object length is not a multiple of
shorter object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Show the names at positions 114:115, where the comparison above first
# turns FALSE (df1 in row 1, df3 in row 2).
rbind(names(df1)[114:115], names(df3)[114:115])
     [,1]                                                                                                                                         
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
     [,2]                                                                                                                                       
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    

Once again, there appears to be a problem with the ‘doctor_told_you’ field. We’ll delete it to see if this fixes one of the problems.

# Remove the doctor_told_you column(s) from df3 and compare names again.
df3 <- dplyr::select(df3, -contains('doctor_told_you'))

names(df1) == names(df3)
Warning in names(df1) == names(df3): longer object length is not a multiple of
shorter object length
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[277] FALSE FALSE FALSE FALSE FALSE FALSE

We still have misalignments at column 210.

# Show the names at positions 210:213 (df1 in row 1, df3 in row 2).
rbind(names(df1)[210:213], names(df3)[210:213])
     [,1]                                                                                                          
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"               
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"
     [,2]                                                                                              
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"   
     [,3]                                                                                              
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime" 
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
     [,4]                                                                                                    
[1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.transportation"
[2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime"       

The `technology_use_scale` column exists in df3 but not in df1.

# Remove the technology_use_scale column(s) from df3, then compare the
# dimensions of the two data frames (df1 in row 1, df3 in row 2).
df3 <- dplyr::select(df3, -contains('technology_use_scale'))

rbind(dim(df1), dim(df3))
     [,1] [,2]
[1,]  242  281
[2,]  511  281
# Both data frames now have 281 columns, so this comparison is position by
# position with no recycling.
names(df1) == names(df3)
  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
 [97]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[109]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[145]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[157]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[169]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[193]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[205]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[217]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[229]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[253]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
[265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[277]  TRUE  TRUE  TRUE  TRUE  TRUE

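The single remaining FALSE (column 263) is presumably the typicalday/typical_day spelling difference seen earlier, since df1 was renamed to the typical_day spelling. Future versions of the workflow will need to handle this more elegantly.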

Option 1: Fix the underlying forms.

Option 2: Add the ‘missing’ columns as NA in post-processing.

For now, I’m going to create functions that align these data frames. These are incorporated into R/utils.R so we do not source them again here.

#' Drop the `technology_use_scale` column(s)
#'
#' @param df A data frame.
#' @return `df` without any column whose name contains
#'   "technology_use_scale".
remove_technology_use_scale <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("technology_use_scale"))
}

#' Drop the `doctor_told_you` column(s)
#'
#' @param df A data frame.
#' @return `df` without any column whose name contains "doctor_told_you".
remove_doctor_told_you <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("doctor_told_you"))
}

#' Drop all Databrary-related columns
#'
#' @param df A data frame.
#' @return `df` without any column whose name contains "group_databrary".
remove_databrary_fields <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("group_databrary"))
}

#' Harmonise the spelling of "typical_day" in column names
#'
#' Replaces every occurrence of "typicalday" in the column names with
#' "typical_day". Names that already use the underscore form are unchanged.
#'
#' @param df A data frame.
#' @return `df` with the renamed columns.
reconcile_typicalday <- function(df) {
  harmonised <- stringr::str_replace_all(names(df), "typicalday", "typical_day")
  stats::setNames(df, harmonised)
}

#' Strip the `group_locomotor_milestones.` label from selected column names
#'
#' Removes the first "group_locomotor_milestones." from every column name in
#' which "locomotor_milestones" is followed later by one of: health, division,
#' rothbart, mediause, pets, typical, acknowledge. All other names are left
#' untouched.
#'
#' @param df A data frame.
#' @return `df` with the affected column names shortened.
remove_permissive_locomotor_milestones_label <- function(df) {
  col_names <- names(df)
  # The alternatives are grouped so that each keyword must follow
  # "locomotor_milestones". Ungrouped, `|` splits the whole pattern, and any
  # name that merely contains e.g. "pets" would be selected as well.
  nested_under_locomotor <- stringr::str_detect(
    col_names,
    pattern = "locomotor_milestones.*(health|division|rothbart|mediause|pets|typical|acknowledge)"
  )
  col_names[nested_under_locomotor] <- stringr::str_remove(
    col_names[nested_under_locomotor],
    "group_locomotor_milestones\\."
  )
  names(df) <- col_names
  df
}

#' Drop columns whose names contain "X_" or "meta.instanceID"
#'
#' The "X_" match is case-sensitive. `contains()` ignores case by default, so
#' without `ignore.case = FALSE` any column whose name merely contains a
#' lowercase "x_" would be dropped as well.
#'
#' @param df A data frame.
#' @return `df` without the matching columns.
remove_X_meta_cols <- function(df) {
  dplyr::select(
    df,
    -contains("X_", ignore.case = FALSE),
    -contains("meta.instanceID")
  )
}

#' Strip the two outer questionnaire group labels from column names
#'
#' Removes every "group_homevisitquestionnaires." and then every
#' "group_combinedquestionnaires." from the column names.
#'
#' @param df A data frame.
#' @return `df` with the shortened column names.
remove_redundant_group_labels <- function(df) {
  redundant_labels <- c(
    'group_homevisitquestionnaires\\.',
    'group_combinedquestionnaires\\.'
  )
  # Applied one after the other, in this order.
  for (label in redundant_labels) {
    names(df) <- stringr::str_remove_all(names(df), label)
  }
  df
}

#' Apply every column-alignment step to a data frame
#'
#' Runs the cleaning helpers in a fixed order, feeding the result of each step
#' into the next.
#'
#' @param df A data frame.
#' @return The data frame after all steps have been applied.
clean_dfs <- function(df) {
  cleaning_steps <- list(
    reconcile_typicalday,
    remove_technology_use_scale,
    remove_doctor_told_you,
    remove_permissive_locomotor_milestones_label,
    remove_databrary_fields,
    remove_X_meta_cols,
    remove_redundant_group_labels
  )
  # Fold the steps over `df`, left to right.
  Reduce(function(current, step) step(current), cleaning_steps, df)
}

Let’s test this workflow with the unmodified files.

# Run the full cleaning pipeline on the first 287-column data frame and
# report its dimensions.
df1m <- clean_dfs(df_merge_287_cols_1)
dim(df1m)
[1] 242 272
# Same for the second 287-column data frame.
df2m <- clean_dfs(df_merge_287_cols_2)
dim(df2m)
[1]  58 272
# Same for the 288-column data frame.
df3m <- clean_dfs(df_merge_288_cols)
dim(df3m)
[1] 511 272
# Number of positions at which the column names of df1m and df2m agree.
sum(names(df1m) == names(df2m))
[1] 272
# Number of positions at which the column names of df1m and df3m agree.
sum(names(df1m) == names(df3m))
[1] 272

Merging and exporting

# Stack the three cleaned data frames row-wise into one aggregate data frame.
df <- rbind(df1m, df2m, df3m)

Save exported aggregate file.

# Write the merged data frame to the aggregate directory. The data directory
# is git-ignored, so the target folder may not exist on a fresh checkout;
# create it first (a no-op when it is already there).
agg_dir <- file.path(here::here(), "data/csv/home_visit/agg")
dir.create(agg_dir, recursive = TRUE, showWarnings = FALSE)
readr::write_csv(df, file = file.path(agg_dir, "PLAY-non-mcdi-raw-latest.csv"))