Home visit

Protocol

Details about the data collection protocol for the home visit can be found on the PLAY Project website.

Overview

The notes below summarize the steps needed to process the home visit survey data. Most of the R chunks are not run, however, since we have moved the work into a {targets}-based workflow.

Download data

Data files for each of the language by age-group conditions are stored on KoBoToolbox (KBT).

Store the list of all data files available on KBT in kb_df.

# Pipeline target: `kb_df` is built by list_kobo_data().
tar_target(kb_df, list_kobo_data()),
# Load the stored `kb_df` target from the ../_targets store and print it.
library(targets)
targets::tar_load(kb_df, store="../_targets")
kb_df

List data forms specific to the home visit by filtering the files with names that contain “Home”.

# Pipeline target: keep the rows of `kb_df` whose `title` contains "Home".
tar_target(kb_home, dplyr::filter(kb_df, stringr::str_detect(title, "Home")))
# Load the stored `kb_home` target from the ../_targets store and print it.
targets::tar_load(kb_home, store="../_targets")
kb_home
##         id              id_string
## 1   411456 a58ZhjX6M8WXtzLGvBaEaG
## 2   740625 a8HyzdxnVjK53dt4fX3Vup
## 3   331453 aae4wz2tpkSnAkea2PAzin
## 4   363465 abYGMsrMvkGsnA4xm9kqC9
## 5   363431 acCBJaRqBkSMPYdvaaWCSu
## 6   334099 ad3NtS6TDPuKTUKCAyCuby
## 7   363349 aEMERWLYQAdhaMKMt5QkD7
## 8   307736 aeMJQLo56HWyVHkKX9oaJD
## 9   740631 ahYJ7XRUcit8gbFYwaxnAV
## 10 1151489 aLqdPLTCZb8jvFkDxcFwzy
## 11 1136694 aMteoHisncRUStdyZfsjMB
## 12  740623 aoLcmmN8KWG4N6A3XpUiRT
## 13  740628 aRJW7SwKsVyZu8b4FighiM
## 14  363466 asv7tL6CEYjiW2XCkLxhcX
## 15  411469 aswcEAtxj24u9MfuBmS9AW
## 16  740630 aThMD6zuEFjtTfptwcpZPg
## 17  411388 aTyGBYk6HvRXmki9ksb29p
## 18  740624 aUeYzPfq6WED4dDF28VT9D
## 19  363381 aWd7wujF5d5HYaYMhT4jBZ
## 20  740626 awMnFDna5kWR3LXKyGmFMd
## 21  331848 axYbEbLcJxzUNFbqMVySjJ
## 22  740629 axyJSgLRDeSk3BHB6cJmYk
## 23  408149 ayhCy68ZTaDGc7tuGthQpJ
## 24  740627 aYokcZAjTZ9JCyVXz4R5rh
##                                                                      title
## 1              PLAY Home Questionaires - 12 Bilingual English (2020-03-04)
## 2                                        PLAY_HomeQuestionaires_12_English
## 3                                    PLAY Home Questionnaires (24 English)
## 4             PLAY Home Questionnaires - 24 Bilingual English (2020-03-04)
## 5                       PLAY Home Questionnaires - 12 English (2020-03-04)
## 6                        PLAY Home Questionnaires (12 Bilingual - English)
## 7                       PLAY Home Questionnaires - 18 English (2020-03-04)
## 8                                    PLAY Home Questionnaires (18 English)
## 9                              PLAY_HomeQuestionaires_24_Bilingual_English
## 10 Clone of Clone of PLAY_HomeQuestionnaires_18_English - NO AUTO METADATA
## 11             Clone of PLAY_HomeQuestionnaires_18_English - NO AUTO TODAY
## 12                             PLAY_HomeQuestionaires_12_Bilingual_English
## 13                                      PLAY_HomeQuestionnaires_18_English
## 14            PLAY Home Questionnaires - 18 Bilingual English (2020-03-04)
## 15            PLAY Home Questionnaires - 12 Bilingual Spanish (2020-03-04)
## 16                             PLAY_HomeQuestionaires_24_Bilingual_Spanish
## 17            PLAY Home Questionnaires - 18 Bilingual Spanish (2020-03-04)
## 18                             PLAY_HomeQuestionaires_12_Bilingual_Spanish
## 19                      PLAY Home Questionnaires - 24 English (2020-03-04)
## 20                             PLAY_HomeQuestionaires_18_Bilingual_English
## 21                                   PLAY Home Questionnaires (12 English)
## 22                                      PLAY_HomeQuestionnaires_24_English
## 23            PLAY Home Questionnaires - 24 Bilingual Spanish (2020-03-04)
## 24                             PLAY_HomeQuestionaires_18_Bilingual_Spanish
##                                                                description
## 1            PLAY Home Questionaires - 12 Bilingual (English) (2020-03-04)
## 2                             PLAY_HomeQuestionaires_12_English-2021-07-29
## 3                                    PLAY Home Questionnaires (24 English)
## 4             PLAY Home Questionnaires - 24 Bilingual English (2019-11-20)
## 5                       PLAY Home Questionnaires - 12 English (2019-11-20)
## 6                        PLAY Home Questionnaires (12 Bilingual - English)
## 7                       PLAY Home Questionnaires - 18 English (2018-11-20)
## 8                                    PLAY Home Questionnaires (18 English)
## 9                   PLAY_HomeQuestionaires_24_Bilingual_English-2021-07-30
## 10 Clone of Clone of PLAY_HomeQuestionnaires_18_English - NO AUTO METADATA
## 11                             Clone of PLAY_HomeQuestionnaires_18_English
## 12                  PLAY_HomeQuestionaires_12_Bilingual_English-2021-07-29
## 13                           PLAY_HomeQuestionnaires_18_English-2021-07-29
## 14            PLAY Home Questionnaires - 18 Bilingual English (2019-11-20)
## 15            PLAY Home Questionnaires - 12 Bilingual Spanish (2020-03-04)
## 16                  PLAY_HomeQuestionaires_24_Bilingual_Spanish-2021-07-30
## 17            PLAY Home Questionnaires - 18 Bilingual Spanish (2020-03-04)
## 18                  PLAY_HomeQuestionaires_12_Bilingual_Spanish-2021-07-29
## 19                      PLAY Home Questionnaires - 24 English (2019-11-20)
## 20                  PLAY_HomeQuestionaires_18_Bilingual_English-2021-07-29
## 21                                   PLAY Home Questionnaires (12 English)
## 22                           PLAY_HomeQuestionnaires_24_English-2021-07-30
## 23            PLAY Home Questionnaires - 24 Bilingual Spanish (2020-02-27)
## 24                  PLAY_HomeQuestionaires_18_Bilingual_Spanish-2021-07-29
##                                               url
## 1   https://kc.kobotoolbox.org/api/v1/data/411456
## 2   https://kc.kobotoolbox.org/api/v1/data/740625
## 3   https://kc.kobotoolbox.org/api/v1/data/331453
## 4   https://kc.kobotoolbox.org/api/v1/data/363465
## 5   https://kc.kobotoolbox.org/api/v1/data/363431
## 6   https://kc.kobotoolbox.org/api/v1/data/334099
## 7   https://kc.kobotoolbox.org/api/v1/data/363349
## 8   https://kc.kobotoolbox.org/api/v1/data/307736
## 9   https://kc.kobotoolbox.org/api/v1/data/740631
## 10 https://kc.kobotoolbox.org/api/v1/data/1151489
## 11 https://kc.kobotoolbox.org/api/v1/data/1136694
## 12  https://kc.kobotoolbox.org/api/v1/data/740623
## 13  https://kc.kobotoolbox.org/api/v1/data/740628
## 14  https://kc.kobotoolbox.org/api/v1/data/363466
## 15  https://kc.kobotoolbox.org/api/v1/data/411469
## 16  https://kc.kobotoolbox.org/api/v1/data/740630
## 17  https://kc.kobotoolbox.org/api/v1/data/411388
## 18  https://kc.kobotoolbox.org/api/v1/data/740624
## 19  https://kc.kobotoolbox.org/api/v1/data/363381
## 20  https://kc.kobotoolbox.org/api/v1/data/740626
## 21  https://kc.kobotoolbox.org/api/v1/data/331848
## 22  https://kc.kobotoolbox.org/api/v1/data/740629
## 23  https://kc.kobotoolbox.org/api/v1/data/408149
## 24  https://kc.kobotoolbox.org/api/v1/data/740627

Save selected raw files to local directory

Prepare to retrieve all home visit files.

# Number of home visit forms: one row of `kb_home` per form.
n_files <- nrow(kb_home)

There are \(n=\) 24 home visit data files.

# Pipeline target: call retrieve_kobo_xlsx() on the forms listed in `kb_home`,
# passing "data/xlsx/home_visit/raw" as its second argument.
tar_target(
  home_visit_downloads,
  retrieve_kobo_xlsx(kb_home, 
    "data/xlsx/home_visit/raw"),
  # Age-based cue; `update_interval` and `update_interval_units` are defined
  # outside this snippet.
  cue = tarchetypes::tar_cue_age(
    name = home_visit_downloads,
    age = as.difftime(update_interval, 
      units = update_interval_units)
  )
)

Normalize file names

Some of the form names are inconsistent, so we normalize them to fit the following pattern:

<form_id>_PLAY_HomeQuestionnaires_<age_group>_<lang_group>.xlsx

# Pipeline target: pass the downloaded files through rename_home_xlsx(), with
# "data/xlsx/home_visit/std_name" as its second argument. The age-based cue
# uses `update_interval` / `update_interval_units`, defined outside this snippet.
tar_target(
  name = home_visit_renamed,
  command = rename_home_xlsx(
    home_visit_downloads,
    "data/xlsx/home_visit/std_name"
  ),
  cue = tarchetypes::tar_cue_age(
    name = home_visit_renamed,
    age = as.difftime(update_interval, units = update_interval_units)
  )
)

Save xlsx as csv

# Pipeline target: call load_xlsx_save_many_csvs_2() on the renamed files,
# passing "data/csv/home_visit/raw" as its second argument.
tar_target(
  home_visit_xlsx_to_csv,
  load_xlsx_save_many_csvs_2(home_visit_renamed, 
    "data/csv/home_visit/raw")
)

Split MB-CDI from other questions

Next we import a CSV for a given form year, age group, and language group, and create two new CSV files: one with the MB-CDI data and one with all of the other survey questions.

By default, the document presumes that we want to convert all of the CSV files.

Extract the ‘non-mbcdi’ questions first and add ‘non_mbcdi’ to the filename.

# Pipeline target: run split_non_mbcdi_csvs() on the raw CSVs, with
# "data/csv/home_visit/non_mbcdi" as its second argument.
tar_target(
  name = home_visit_non_mbcdi,
  command = split_non_mbcdi_csvs(
    home_visit_xlsx_to_csv,
    "data/csv/home_visit/non_mbcdi"
  )
)

Extracting the MB-CDI data uses nearly the same call, but through the parallel function split_mbcdi_csvs(), which handles the ‘mbcdi’ questions and is given its own output directory.

# Pipeline target: run split_mbcdi_csvs() on the raw CSVs, with
# "data/csv/home_visit/mbcdi" as its second argument.
tar_target(
  name = home_visit_mbcdi,
  command = split_mbcdi_csvs(
    home_visit_xlsx_to_csv,
    "data/csv/home_visit/mbcdi"
  )
)

Clean data

Remove identifiers

The function remove_identifiers() in R/kobo_export detects the presence of names, addresses, phone numbers, email, and dates in the field names for an input file and removes these fields. It also modifies the file name by appending _deidentified.

The remove_identifiers() function detects these fields by matching patterns against the column names. For clarity, we print it here.

# NOTE(review): this sources a file under the author's home directory (in an
# `_OLD` folder), so the chunk only runs where that path exists. The text above
# points to R/kobo_export instead; confirm which definition the pipeline uses.
source("~/rrr/KoBoToolbox/R/_OLD/functions.R", echo = FALSE, print.eval = FALSE)
# Print the function definition.
remove_identifiers
## function(df) {
##   require(stringr)
##   stopifnot(is.data.frame(df))
##   
##   contains_name <- stringr::str_detect(names(df), 'name')
##   contains_address <- stringr::str_detect(names(df), 'address')
##   contains_phone <- stringr::str_detect(names(df), 'phone')
##   contains_email <- stringr::str_detect(names(df), 'email')
##   contains_birthdate <- stringr::str_detect(names(df), 'birthdate')
##   contains_first <- stringr::str_detect(names(df), 'first[12]?')
##   contains_last <- stringr::str_detect(names(df), 'last[12]?')
##   contains_city <- stringr::str_detect(names(df), 'city')
##   contains_year <- stringr::str_detect(names(df), 'year[12]?')
##   contains_month <- stringr::str_detect(names(df), 'month[12]?')
##   contains_day <- stringr::str_detect(names(df), '/day[12]?$')
##   
##   identifiable_data <- contains_name | contains_address |
##     contains_phone |
##     contains_email | contains_birthdate | contains_first |
##     contains_last |
##     contains_city | contains_year | contains_month | contains_day
##   
##   identifiable_cols <- (1:length(names(df)))[identifiable_data]
##   
##   df_deidentified <- df %>%
##     dplyr::select(.,-all_of(identifiable_cols))
##   
##   df_deidentified
## }

The non-MBCDI file contains the identifiers, so that is the target of this removal process.

Note that we have added the data directory to .gitignore in protocol/, the root directory for the HTML protocol, so none of the data files should be made available via git or GitHub. This also means that the raw data files themselves are not under version control.

# Pipeline target: apply open_deidentify_save() to each non-MB-CDI CSV path and
# collect the results as a character vector.
tar_target(
  name = home_visit_remove_identifiers,
  command = purrr::map_chr(
    .x = home_visit_non_mbcdi,
    .f = open_deidentify_save,
    csv_save_dir = "data/csv/home_visit/non_mbcdi/deid",
    these_questions = 'non_mbcdi'
  )
)

Quality assurance (QA) reviews

MB-CDI files

Non-MB-CDI files

Create a helper function to create a data set with summary information about the data files.

#' Summarize the dimensions of one non-MB-CDI CSV file.
#'
#' @param fn Path to a single CSV file (a length-one character vector).
#' @return A one-row tibble with columns `file_name` (the base name of `fn`),
#'   `n_rows`, and `n_vars` (row and column counts of the parsed file).
#'   Signals an error if `fn` is not a single string or the file is missing.
summarize_non_mbcdi_qs <- function(fn) {
  stopifnot(is.character(fn), length(fn) == 1L)

  if (!file.exists(fn)) {
    stop('File not found `', fn, '`')
  }

  df <- readr::read_csv(fn, show_col_types = FALSE)
  if (!is.data.frame(df)) {
    stop('Error reading data frame')
  }

  # Namespace-qualified, like the other calls here, so the helper does not
  # depend on tibble being attached. A one-row result needs no sorting.
  tibble::tibble(
    file_name = basename(fn),
    n_rows = nrow(df),
    n_vars = ncol(df)
  )
}

Select the de-identified CSVs to examine.

# List the de-identified non-MB-CDI CSVs for the 12-, 18-, and 24-month forms.
# The age group is matched with a grouped alternation; a bracketed class such
# as [12|18|24] matches any single one of those characters instead.
fl <-
  list.files(
    "../data/csv/home_visit/non_mbcdi/deid",
    pattern = "^[0-9]+_non_mbcdi_(12|18|24)_.*deidentified",
    full.names = TRUE
  )

# Build one summary row per file, then render the combined table as HTML.
PLAY_forms <- purrr::map_df(.x = fl, .f = summarize_non_mbcdi_qs)

kableExtra::kable_classic(knitr::kable(PLAY_forms, format = 'html'))
file_name n_rows n_vars
1136694_non_mbcdi_18_english_deidentified.csv 0 288
1151489_non_mbcdi_18_english_deidentified.csv 0 286
307736_non_mbcdi_18_english_deidentified.csv 4 274
331453_non_mbcdi_24_english_deidentified.csv 3 274
331848_non_mbcdi_12_english_deidentified.csv 4 267
334099_non_mbcdi_12_bilingual_english_deidentified.csv 1 267
363349_non_mbcdi_18_english_deidentified.csv 9 280
363381_non_mbcdi_24_english_deidentified.csv 8 280
363431_non_mbcdi_12_english_deidentified.csv 10 281
363465_non_mbcdi_24_bilingual_english_deidentified.csv 0 280
363466_non_mbcdi_18_bilingual_english_deidentified.csv 0 280
408149_non_mbcdi_24_bilingual_spanish_deidentified.csv 1 280
411388_non_mbcdi_18_bilingual_spanish_deidentified.csv 0 280
411456_non_mbcdi_12_bilingual_english_deidentified.csv 1 280
411469_non_mbcdi_12_bilingual_spanish_deidentified.csv 1 280
740623_non_mbcdi_12_bilingual_english_deidentified.csv 42 288
740624_non_mbcdi_12_bilingual_spanish_deidentified.csv 4 288
740625_non_mbcdi_12_english_deidentified.csv 220 288
740626_non_mbcdi_18_bilingual_english_deidentified.csv 61 287
740627_non_mbcdi_18_bilingual_spanish_deidentified.csv 7 287
740628_non_mbcdi_18_english_deidentified.csv 210 288
740629_non_mbcdi_24_english_deidentified.csv 174 287
740630_non_mbcdi_24_bilingual_spanish_deidentified.csv 3 287
740631_non_mbcdi_24_bilingual_english_deidentified.csv 45 287
740631_non_mbcdi_24_bilingual_spanish_deidentified.csv 26 287

The later forms (with higher form numbers–the leading integers in the file names) are the newer ones. These generally have the largest number of entries and have similar numbers of columns–either 287 or 288. Accordingly, we focus our cleaning efforts here first.

We start with the data files that have \(n=288\) columns.

# Read the two 12-month bilingual forms, then count the column names that
# match position by position.
df740623 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740623_non_mbcdi_12_bilingual_english_deidentified.csv",
  show_col_types = FALSE
)

df740624 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740624_non_mbcdi_12_bilingual_spanish_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740623) == names(df740624))
## [1] 288
# Read the 12-month English form and count its positional column-name matches
# against df740623.
df740625 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740625_non_mbcdi_12_english_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740623) == names(df740625))
## [1] 288
# Read the 18-month English form and count its positional column-name matches
# against df740623.
df740628 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740628_non_mbcdi_18_english_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740623) == names(df740628))
## [1] 288

So, four of the most recent data files with \(n=288\) columns can be aggregated without modification.

Let’s turn to the more recent files with \(n=287\) columns.

# Read the two 18-month bilingual forms, then count the column names that
# match position by position.
df740626 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740626_non_mbcdi_18_bilingual_english_deidentified.csv",
  show_col_types = FALSE
)

df740627 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740627_non_mbcdi_18_bilingual_spanish_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740626) == names(df740627))
## [1] 100

Where does the misalignment arise?

names(df740626) == names(df740627)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [271] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [281]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

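The misalignment begins at column 92 and continues through column 278; the first 91 and the last 9 column names match, which accounts for the 100 matches counted above.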

# Read the 24-month English form and count its positional column-name matches
# against df740626.
df740629 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740629_non_mbcdi_24_english_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740626) == names(df740629))
## [1] 287

So, df740626 and df740629 are aligned and can be merged.

# Read the 24-month bilingual Spanish form and count its positional
# column-name matches against df740626.
df740630 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740630_non_mbcdi_24_bilingual_spanish_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740626) == names(df740630))
## [1] 100
names(df740626) == names(df740630)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [271] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [281]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

These files also fall out of alignment near column 92.

# Read the 24-month bilingual English form and count its positional
# column-name matches against df740626.
df740631 <- readr::read_csv(
  file = "../data/csv/home_visit/non_mbcdi/deid/740631_non_mbcdi_24_bilingual_english_deidentified.csv",
  show_col_types = FALSE
)

sum(names(df740626) == names(df740631))
## [1] 100
names(df740626) == names(df740631)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [271] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [281]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

And these files fall out of alignment near column 92.

Let’s see if df740627, df740630, and df740631 are aligned with one another.

sum(names(df740627) == names(df740630))
## [1] 287
sum(names(df740627) == names(df740631))
## [1] 287

Yes, they are. So, these three can be merged. We do that first, then address the discrepancies between aggregates.

‘Older’ forms

The “older” forms have varied numbers of columns. We focus on those with data (n_rows > 0).

# Load the older de-identified forms that contain at least one row of data.
# The reader is namespace-qualified (readr::) to match the earlier chunks, so
# this chunk does not depend on readr being attached.
read_deid_csv <- function(file_name) {
  readr::read_csv(
    file.path("../data/csv/home_visit/non_mbcdi/deid", file_name),
    show_col_types = FALSE
  )
}

df307736 <- read_deid_csv("307736_non_mbcdi_18_english_deidentified.csv")
df331453 <- read_deid_csv("331453_non_mbcdi_24_english_deidentified.csv")
df331848 <- read_deid_csv("331848_non_mbcdi_12_english_deidentified.csv")
df334099 <-
  read_deid_csv("334099_non_mbcdi_12_bilingual_english_deidentified.csv")
df363349 <- read_deid_csv("363349_non_mbcdi_18_english_deidentified.csv")
df363381 <- read_deid_csv("363381_non_mbcdi_24_english_deidentified.csv")
df363431 <- read_deid_csv("363431_non_mbcdi_12_english_deidentified.csv")
df408149 <-
  read_deid_csv("408149_non_mbcdi_24_bilingual_spanish_deidentified.csv")
df411456 <-
  read_deid_csv("411456_non_mbcdi_12_bilingual_english_deidentified.csv")
df411469 <-
  read_deid_csv("411469_non_mbcdi_12_bilingual_spanish_deidentified.csv")

Let’s look at the two forms that have the same number of columns, \(n=274\), 307736 and 331453.

names(df307736) == names(df331453)
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [217] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [229] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Comparing the length of the element-wise result with the length of the names
# is TRUE for any two equal-length vectors; identical() tests the names.
identical(names(df307736), names(df331453))
## [1] TRUE

So, these two have identical column names and could be merged.

# Pipeline target: logical vector flagging the de-identified file paths that
# contain "/307736" or "/331453".
tar_target(files_274_cols, stringr::str_detect(home_visit_remove_identifiers, "/(307736|331453)"))

# Pipeline target: pass the flagged paths to make_aggregate_data_file().
tar_target(df_merge_274_cols, make_aggregate_data_file(home_visit_remove_identifiers[files_274_cols]))

How about the files with \(n=267\) columns, 331848 and 334099?

names(df331848) == names(df334099)
##   [1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [11] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [21] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [31] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [51] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [261]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
# Comparing the length of the element-wise result with the length of the names
# is TRUE for any two equal-length vectors; identical() tests the names, which
# differ here (see the element-wise comparison above).
identical(names(df331848), names(df334099))
## [1] FALSE
names(df331848) |> head()
## [1] "group_combinedquestionnaires/participant_id"             
## [2] "start"                                                   
## [3] "end"                                                     
## [4] "group_combinedquestionnaires/note_fillthisoutbeforestudy"
## [5] "group_combinedquestionnaires/site_id"                    
## [6] "group_combinedquestionnaires/subject_number"
names(df334099) |> head()
## [1] "group_jo84c13/participant_id"             
## [2] "start"                                    
## [3] "end"                                      
## [4] "group_jo84c13/note_fillthisoutbeforestudy"
## [5] "group_jo84c13/site_id"                    
## [6] "group_jo84c13/subject_number"

There is an odd difference in the group label, group_combinedquestionnaires vs. group_jo84c13.

Let’s try deleting the initial group labels and compare again.

# Keep the raw column names of both forms, then preview the first form's names
# with its group prefix removed.
n2 <- names(df334099)
n1 <- names(df331848)

head(stringr::str_remove(n1, "group_combinedquestionnaires/"))
## [1] "participant_id"              "start"                      
## [3] "end"                         "note_fillthisoutbeforestudy"
## [5] "site_id"                     "subject_number"
# Preview the second form's names with its group prefix removed.
head(stringr::str_remove(names(df334099), "group_jo84c13/"))
## [1] "participant_id"              "start"                      
## [3] "end"                         "note_fillthisoutbeforestudy"
## [5] "site_id"                     "subject_number"

That looks promising.

# Strip the form-specific group prefix from each set of column names, then
# compare them position by position. Plain `<-` assignment replaces the
# right-assignment form so the assigned names are visible at the line start.
n1 <- stringr::str_remove(names(df331848), "group_combinedquestionnaires/")
n2 <- stringr::str_remove(names(df334099), "group_jo84c13/")
n1 == n2
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
##  [11] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [21] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [31] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [51] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [221]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [261]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
cbind(n1[8:15], n2[8:15])
##      [,1]                     [,2]                    
## [1,] "test_date"              "test_date"             
## [2,] "child_sex"              "child_birth_date"      
## [3,] "age_group"              "child_sex"             
## [4,] "language_child"         "age_group"             
## [5,] "language_child/english" "language_child"        
## [6,] "language_child/spanish" "language_child/english"
## [7,] "language_instruction"   "language_child/spanish"
## [8,] "acknowledge_site"       "language_instruction"

n2 (the column names from df334099) has a child_birth_date field in position 9 that n1 does not have.

n1 |> str_detect("child_birth_date") |> sum()
## [1] 0

If we delete that variable, the two name vectors will no longer have the same length, so the element-wise comparison below recycles the shorter vector and emits a warning. Let’s explore that anyway.

n2_2 <- n2[-9]

n1 == n2_2
## Warning in n1 == n2_2: longer object length is not a multiple of
## shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
##  [31]  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
##  [41] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

That helps a bit, but we diverge around column 29.

cbind(n1[28:51], n2_2[28:51])
##       [,1]                                                                                                
##  [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"              
##  [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/birthhospital"
##  [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/afterhome"    
##  [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"           
##  [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"      
##  [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/donotknow"    
##  [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"               
##  [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/birthhospital" 
##  [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/afterhome"     
## [10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"            
## [11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"       
## [12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
## [13,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
## [14,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
## [15,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
## [16,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
## [17,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
## [18,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
## [19,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
## [20,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
## [21,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
## [22,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
## [23,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
## [24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
##       [,2]                                                                                                                                  
##  [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested"                                                
##  [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__in_the_bi"                                 
##  [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/yes__after_goi"                                 
##  [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/no"                                             
##  [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/refused"                                        
##  [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_hearing_tested/don_t_know"                                     
##  [7,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested"                                                 
##  [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__in_the_bi"                                  
##  [9,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/yes__after_goi"                                  
## [10,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/no"                                              
## [11,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/refused"                                         
## [12,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/don_t_know"                                      
## [13,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/child_allergies_infections_ill_header"
## [14,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"                            
## [15,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"                        
## [16,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"                               
## [17,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"                          
## [18,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"                     
## [19,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                                                
## [20,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                                                  
## [21,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                                      
## [22,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                                             
## [23,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                                     
## [24,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"

These question labels look very similar; there are just some minor changes in the variable names. n2_2 also has an extra variable, child_allergies_infections_ill_header, in column 40.

n2_3 <- n2_2[-40]

Then, we can rename some of the columns in n2_3 using corresponding names from n1.

# Map the response-option suffixes used in n2_3 onto the spellings used in n1
# (birthhospital, afterhome, donotknow).
n2_4 <- n2_3 |>
  stringr::str_replace("yes__in_the_bi", "birthhospital") |>
  stringr::str_replace("yes__after_goi", "afterhome") |>
  stringr::str_replace("don_t_know", "donotknow")

n1 == n2_4
## Warning in n1 == n2_4: longer object length is not a multiple of
## shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [41] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [51] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [231]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
cbind(n1[39:51], n2_4[39:51])
##       [,1]                                                                                                
##  [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"     
##  [2,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/allergies"       
##  [3,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/ear_infection"   
##  [4,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/asthma"          
##  [5,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/respiratory"     
##  [6,] "group_homevisitquestionnaires/group_health/group_general_health/group_medicalprof/gastrointestinal"
##  [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"              
##  [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                
##  [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                    
## [10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"           
## [11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                   
## [12,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"                           
## [13,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                       
##       [,2]                                                                                                             
##  [1,] "group_homevisitquestionnaires/group_health/group_general_health/child_vision_tested/donotknow"                  
##  [2,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/allergies"       
##  [3,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/ear_infection"   
##  [4,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/asthma"          
##  [5,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/respiratory"     
##  [6,] "group_homevisitquestionnaires/group_health/group_general_health/child_allergies_infections_ill/gastrointestinal"
##  [7,] "group_homevisitquestionnaires/group_health/group_general_health/comments_allergy_etc"                           
##  [8,] "group_homevisitquestionnaires/group_health/group_general_health/child_injury_times"                             
##  [9,] "group_homevisitquestionnaires/group_health/group_general_health/comment_injury"                                 
## [10,] "group_homevisitquestionnaires/group_health/group_general_health/comments_general_health"                        
## [11,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"                                
## [12,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"                                    
## [13,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"

n1 has a group_medicalprof label from allergies through gastrointestinal; n2_4 has child_allergies_infections_ill for the same questions.

# Use n1's group label (group_medicalprof) for the allergy/illness questions.
n2_5 <- n2_4 |>
  stringr::str_replace("child_allergies_infections_ill", "group_medicalprof")
n1 == n2_5
## Warning in n1 == n2_5: longer object length is not a multiple of
## shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [51] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [231]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
cbind(n1[49:60], n2_5[49:60])
##       [,1]                                                                             
##  [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
##  [2,] "group_homevisitquestionnaires/group_health/group_prenatal/prenatal_care"        
##  [3,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
##  [4,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
##  [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
##  [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
##  [7,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
##  [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
##  [9,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
## [10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
## [11,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
## [12,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
##       [,2]                                                                             
##  [1,] "group_homevisitquestionnaires/group_health/group_prenatal/instructions_prenatal"
##  [2,] "group_homevisitquestionnaires/group_health/group_prenatal/comments_prenatal"    
##  [3,] "group_homevisitquestionnaires/group_health/group_smoking/pregnant_smoking"      
##  [4,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_1"   
##  [5,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_2"   
##  [6,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_trimester_3"   
##  [7,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now"       
##  [8,] "group_homevisitquestionnaires/group_health/group_smoking/mom_smoking_now_amount"
##  [9,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_house"         
## [10,] "group_homevisitquestionnaires/group_health/group_smoking/smoking_car"           
## [11,] "group_homevisitquestionnaires/group_health/group_smoking/comments_smoking"      
## [12,] "group_homevisitquestionnaires/group_health/group_drinking/pregnant_drinking"

It looks like these could be reconciled by deleting prenatal_care from n1.

n1_2 <- n1[-50]
n1_2 == n2_5
## Warning in n1_2 == n2_5: longer object length is not a multiple
## of shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [71] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [81] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [101] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE
cbind(n1_2[64:75], n2_5[64:75])
##       [,1]                                                                                                          
##  [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                                 
##  [2,] "group_homevisitquestionnaires/group_health/group_phq4/note_phq4"                                             
##  [3,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_nervous"                           
##  [4,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_worrying"                          
##  [5,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_littleinterest"                    
##  [6,] "group_homevisitquestionnaires/group_health/group_phq4/group_phq4_001/phq4_down"                              
##  [7,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                         
##  [8,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions1"              
##  [9,] "group_homevisitquestionnaires/group_rothbart/group_rothbartinstructions/rothbart_instructions2"              
## [10,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_unfamiliarperson"
## [11,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_troubletask"     
## [12,] "group_homevisitquestionnaires/group_rothbart/group_rothbart_001/rothbart_questions/rothbart_companyofchild"  
##       [,2]                                                                                                       
##  [1,] "group_homevisitquestionnaires/group_health/group_drinking/comments_drinking"                              
##  [2,] "group_homevisitquestionnaires/group_health/group_phq4/Experimenter_These_stions_are_about_you"            
##  [3,] "group_homevisitquestionnaires/group_health/group_phq4/comments_phq4"                                      
##  [4,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructor_rothbart"                           
##  [5,] "group_homevisitquestionnaires/group_rothbart/group_dd2kz32/instructions_rothbart2"                        
##  [6,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/rothbart_questions_header"                
##  [7,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_approached_by_a_ld_cling_to_a_parent"
##  [8,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_having_trouble_get_easily_irritated"
##  [9,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_a_familiar_chil_company_of_the_child"
## [10,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/When_offered_a_choic_uickly_and_go_for_it"
## [11,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/During_daily_or_even_eing_quietly_sung_to"
## [12,] "group_homevisitquestionnaires/group_rothbart/rothbart_questions/While_playing_outdoo_and_excitement_of_it"

It looks like the individual phq4 questions (phq4_nervous, phq4_worrying, phq4_littleinterest, phq4_down) are not in n2_5.

Let’s check.

n2_5 |> stringr::str_detect("phq4") |> sum()
## [1] 2

Yes, only two names in n2_5 contain phq4: the experimenter note and comments_phq4. The same is true of the column names in df334099 itself.

df334099 |> names() |> stringr::str_detect("phq4") |> sum()
## [1] 2

This path of reconciliation does not appear fruitful.

Make aggregate files

non-MB-CDI files with \(n=288\) columns

# Logical mask over home_visit_remove_identifiers: TRUE where the entry
# matches the file-name pattern for the 288-column non-MB-CDI files.
# NOTE(review): "2[3458]" presumably matches the last two digits of the
# KoBoToolbox form id (e.g. 740623) -- confirm it cannot match other ids.
tar_target(
  files_288_cols,
  stringr::str_detect(
    string = home_visit_remove_identifiers,
    pattern = "2[3458]_non_mbcdi.*_deidentified\\.csv"
  )
)

# Pass the entries flagged by files_288_cols to make_aggregate_data_file().
tar_target(
  df_merge_288_cols,
  make_aggregate_data_file(home_visit_remove_identifiers[files_288_cols])
)

non-MB-CDI files with \(n=287\) columns

# Logical masks over home_visit_remove_identifiers for the two groups of
# 287-column non-MB-CDI files.
# NOTE(review): "2[69]" presumably matches the last two digits of the form id,
# while the second pattern spells the ids out in full -- confirm.
tar_target(
  files_287_cols_1,
  stringr::str_detect(
    string = home_visit_remove_identifiers,
    pattern = "2[69]_non_mbcdi.*_deidentified\\.csv"
  )
),
tar_target(
  files_287_cols_2,
  stringr::str_detect(
    string = home_visit_remove_identifiers,
    pattern = "(740627|740630|740631)_non.*_deidentified\\.csv"
  )
),
# Pass each group of flagged entries to make_aggregate_data_file().
tar_target(
  df_merge_287_cols_1,
  make_aggregate_data_file(home_visit_remove_identifiers[files_287_cols_1])
),
tar_target(
  df_merge_287_cols_2,
  make_aggregate_data_file(home_visit_remove_identifiers[files_287_cols_2])
),

Examine groups with \(n=287\) cols

We focus on the starting column where the column names diverge, column 92.

targets::tar_load(df_merge_287_cols_1, store="../_targets")
targets::tar_load(df_merge_287_cols_2, store="../_targets")
names(df_merge_287_cols_1)[92]
## [1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_feeding_nutrition.instructions_feeding"
names(df_merge_287_cols_2)[92]
## [1] "group_combinedquestionnaires.group_homevisitquestionnaires.group_locomotor_milestones.group_health.group_feeding_nutrition.instructions_feeding"

There is an erroneous group_locomotor_milestones. in the df_merge_287_cols_2 column name.

A bit of sleuthing determines that this group_locomotor_milestones. label is characteristic of columns 92 to 273.

names(df_merge_287_cols_2)[92:273] |> stringr::str_detect(pattern = "group_locomotor_milestones")
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE

The following should fix this.

# Drop the stray "group_locomotor_milestones." path component from columns
# 92 to 273 of df_merge_287_cols_2, then compare the resulting names
# element-wise against those of df_merge_287_cols_1.
old_names <- names(df_merge_287_cols_2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df_merge_287_cols_2) <- new_names
names(df_merge_287_cols_2) == names(df_merge_287_cols_1)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [101]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [221]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [231]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [251]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [261]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [271]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [281]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

We have a second problem with columns from 114 to 210. (Column 264 and columns 274 to 278 also still differ.)

rbind(names(df_merge_287_cols_1)[113:115], names(df_merge_287_cols_2)[113:115])
##      [,1]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_vision_tested.donotknow"
##      [,2]                                                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
##      [,3]                                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"

One of the problems has to do with column 114. There is a question ending doctor_told_you in names(df_merge_287_cols_1) but not in names(df_merge_287_cols_2).

names(df_merge_287_cols_1) |> stringr::str_detect(pattern = "doctor_told_you") |> sum()
## [1] 1
names(df_merge_287_cols_2) |> stringr::str_detect(pattern = "doctor_told_you") |> sum()
## [1] 0

Deleting this question would create additional misalignments and further problems. We cannot proceed without further discussion with our team.

For now, let’s generate an array with all of the remaining differences in column names.

# Flag every position where the two sets of column names still disagree,
# then count the flagged positions.
names_differ <- names(df_merge_287_cols_1) != names(df_merge_287_cols_2)
sum(names_differ)
## [1] 103
rbind(names(df_merge_287_cols_1)[names_differ], names(df_merge_287_cols_2)[names_differ])
##      [,1]                                                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
##      [,2]                                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
##      [,3]                                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"       
##      [,4]                                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.asthma"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"
##      [,5]                                                                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.respiratory"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
##      [,6]                                                                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.gastrointestinal"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"                           
##      [,7]                                                                                                               
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_allergy_etc"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"  
##      [,8]                                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_injury_times"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"    
##      [,9]                                                                                                                  
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comment_injury"         
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
##      [,10]                                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.comments_general_health"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"        
##      [,11]                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.instructions_prenatal"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"        
##      [,12]                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.prenatal_care"    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
##      [,13]                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_prenatal.comments_prenatal"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"  
##      [,14]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.pregnant_smoking"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
##      [,15]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_1"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
##      [,16]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_2"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
##      [,17]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_trimester_3"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"    
##      [,18]                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now"       
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
##      [,19]                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.mom_smoking_now_amount"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"         
##      [,20]                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_house"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"  
##      [,21]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.smoking_car"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"
##      [,22]                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_smoking.comments_smoking"  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"
##      [,23]                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.pregnant_drinking"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
##      [,24]                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_1"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
##      [,25]                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_2"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
##      [,26]                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.drinking_trimester_3"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"   
##      [,27]                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_drinking.comments_drinking"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"            
##      [,28]                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.note_phq4"        
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
##      [,29]                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.instructions_phq4"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous"     
##      [,30]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_nervous" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"
##      [,31]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_worrying"      
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
##      [,32]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_littleinterest"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"          
##      [,33]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.phq4_down"    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
##      [,34]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_phq4.comments_phq4"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
##      [,35]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart1"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"
##      [,36]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.instructions_rothbart2"                        
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"
##      [,37]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.instructions_rothbart3"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
##      [,38]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_unfamiliarperson"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"     
##      [,39]                                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_troubletask"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"
##      [,40]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_companyofchild"  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
##      [,41]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_choiceactivities"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"     
##      [,42]                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_quietlysung"    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
##      [,43]                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_playingoutdoors"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"     
##      [,44]                                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_morethan10"       
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks"
##      [,45]                                                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_respondingremarks" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
##      [,46]                                                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitedlovedadults"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"        
##      [,47]                                                                                                                  
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_fiddlehair"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"
##      [,48]                                                                                                                    
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_roughrowdy"  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"
##      [,49]                                                                                                                           
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedhugged"       
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
##      [,50]                                                                                                                           
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_involvednewactivity"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"        
##      [,51]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tirequickly"  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
##      [,52]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_callattention"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"         
##      [,53]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tags"            
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
##      [,54]                                                                                                                        
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_noisyenvironment"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"          
##      [,55]                                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_energy"  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
##      [,56]                                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_vehicles"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"  
##      [,57]                                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_active"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"
##      [,58]                                                                                                                    
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_forbidden"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
##      [,59]                                                                                                                    
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_sadlytearful"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"    
##      [,60]                                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_downblue"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"
##      [,61]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_runhouse"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
##      [,62]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_excitingevent"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
##      [,63]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_tempertantrum"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
##      [,64]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_waitpatiently"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"  
##      [,65]                                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_rockedsmile"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"       
##      [,66]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_mold"         
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
##      [,67]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_interactadult"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
##      [,68]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_careful"      
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
##      [,69]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_enternewplace"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
##      [,70]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_crymorethan3" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
##      [,71]                                                                                                                     
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_easilysoothed"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"    
##      [,72]                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_busyother"      
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
##      [,73]                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.group_rothbartquestions.rothbart_differentpeople"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"                               
##      [,74]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_rothbart.comments_rothbart"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
##      [,75]                                                                                             
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.mediause_instructions1"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"       
##      [,76]                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv"
##      [,77]                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.tv" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"
##      [,78]                                                                                               
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.dvd"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
##      [,79]                                                                                               
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.computer"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"    
##      [,80]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.ipad"           
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
##      [,81]                                                                                                      
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.educationalgame"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"      
##      [,82]                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.home_technology.videogame"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"      
##      [,83]                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_tv"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"             
##      [,84]                                                                                           
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_how"              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
##      [,85]                                                                                           
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_dvd"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"             
##      [,86]                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.dvd_how"                  
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
##      [,87]                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_computer"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"             
##      [,88]                                                                                            
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.computer_how"         
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
##      [,89]                                                                                            
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_ipad"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"             
##      [,90]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.ipad_how"                    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
##      [,91]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_educational"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"             
##      [,92]                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.educational_how"           
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
##      [,93]                                                                                                 
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.technology_child_videogame"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"             
##      [,94]                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.videogame_how"   
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"
##      [,95]                                                                                            
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.tv_hours_per_day"     
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"
##      [,96]                                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.note_tv_hours_per_day"                    
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
##      [,97]                                                                                                                
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.instructions_technology_use"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"       
##      [,98]                                                                                                  
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"
##      [,99]                                                                                            
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
##      [,100]                                                                                           
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
##      [,101]                                                                                           
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"
##      [,102]                                                                                            
## [1,] "group_combinedquestionnaires.group_databrary.acknowledge_databrary"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.acknowledge_databrary"
##      [,103]                                                                                       
## [1,] "group_combinedquestionnaires.group_databrary.note_saveasdraft"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_saveasdraft"

Visual inspection suggests that these are similar with the following deviations:

  • As noted, df_merge_287_cols_1 has a column ending in doctor_told_you that is not present in df_merge_287_cols_2.
  • df_merge_287_cols_2 has a column ending in technology_use_scale that is not present in df_merge_287_cols_1.
  • There is a set of fields in group_databrary that do not align exactly. We will almost certainly delete these, so the misalignment is not a huge problem.

As an exploration, let’s see if we can reconcile these by deleting the non-aligning columns.

# Work on copies of the two 287-column data frames, dropping the one column
# that each has and the other lacks.
df1 <- dplyr::select(df_merge_287_cols_1, -dplyr::contains("doctor_told_you"))
df2 <- dplyr::select(df_merge_287_cols_2, -dplyr::contains("technology_use_scale"))

# Strip the 'group_locomotor_milestones.' label from columns 92-273 of df2.
# NOTE: the index range is hard-coded for this exploration only.
old_names <- names(df2)
new_names <- replace(
  old_names,
  92:273,
  stringr::str_remove(old_names[92:273], "group_locomotor_milestones\\.")
)
names(df2) <- new_names

names(df1) == names(df2)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [101]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [131]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [141]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [151]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [161]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [171]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [191]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [201]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [211]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [221]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [231]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [251]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [261]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [271]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [281]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

This looks promising.

rbind(names(df1)[263], names(df2)[263])
##      [,1]                                                                                                   
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typicalday" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_typical_day.instructions_typical_day"

This is easily fixed.

# Adopt df2's spelling ('typical_day') for column 263 so the two names agree.
names(df1)[263] <- names(df2)[263]
# Show columns 273-275, part of the remaining mismatched run, side by side.
rbind(names(df1)[273:275], names(df2)[273:275])
##      [,1]                                                                                             
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel1"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel1"
##      [,2]                                                                                             
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel2"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel2"
##      [,3]                                                                                             
## [1,] "group_combinedquestionnaires.group_databrary.note_databraryspiel3"                              
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_databrary.note_databraryspiel3"

The last misalignments relate to Databrary fields.

# Drop every column whose name contains 'group_databrary' from both data
# frames, then re-check that the column names agree position by position.
df1 <- dplyr::select(df1, -dplyr::contains("group_databrary"))
df2 <- dplyr::select(df2, -dplyr::contains("group_databrary"))

names(df1) == names(df2)
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [25] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [97] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [157] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [217] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [229] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [241] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [277] TRUE TRUE TRUE TRUE TRUE

Success!

Combining the two groups of datasets

Now, let’s go back to the data frame with 288 cols and see if we can bring these into alignment.

# Load the 288-column data frame from the targets store.
targets::tar_load(df_merge_288_cols, store = "../_targets")

# Work on a copy with the Databrary columns removed, as was done for df1/df2.
df3 <- dplyr::select(df_merge_288_cols, -dplyr::contains("group_databrary"))

# Rows and columns of df1, df2 and df3, in that order.
c(dim(df1), dim(df2), dim(df3))
## [1] 235 281  55 281 476 283
names(df1) == names(df3)
## Warning in names(df1) == names(df3): longer object length is not
## a multiple of shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [101]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [131] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [141] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [151] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [161] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [171] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [191] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [201] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [271] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [281] FALSE FALSE FALSE
rbind(names(df1)[114:115], names(df3)[114:115])
##      [,1]                                                                                                                                         
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"      
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.doctor_told_you"
##      [,2]                                                                                                                                       
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.ear_infection"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_health.group_general_health.child_allergies_infections_ill.allergies"

Once again, there appears to be a problem with the ‘doctor_told_you’ field. We’ll delete it to see if this fixes one of the problems.

# Remove the 'doctor_told_you' column from df3 and compare names again.
df3 <- dplyr::select(df3, -dplyr::contains("doctor_told_you"))

names(df1) == names(df3)
## Warning in names(df1) == names(df3): longer object length is not
## a multiple of shorter object length
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [101]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [131]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [141]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [151]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [161]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [171]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [191]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [201]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [211] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [231] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [251] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [261] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [271] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [281] FALSE FALSE

We still have misalignments starting at column 210.

rbind(names(df1)[210:213], names(df3)[210:213])
##      [,1]                                                                                                          
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"               
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.technology_use_scale"
##      [,2]                                                                                              
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.meals"   
##      [,3]                                                                                              
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime" 
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.playtime"
##      [,4]                                                                                                    
## [1,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.transportation"
## [2,] "group_combinedquestionnaires.group_homevisitquestionnaires.group_mediause.group_techuse.bedtime"

The `technology_use_scale` field exists in one but not the other.

# Remove the 'technology_use_scale' column from df3, then confirm that df1 and
# df3 now have the same number of columns.
df3 <- dplyr::select(df3, -dplyr::contains("technology_use_scale"))

rbind(dim(df1), dim(df3))
##      [,1] [,2]
## [1,]  235  281
## [2,]  476  281
names(df1) == names(df3)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [11]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [21]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [31]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [41]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [51]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [71]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [81]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [91]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [101]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [131]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [141]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [151]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [161]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [171]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [191]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [201]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [211]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [221]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [231]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [241]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [251]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [261]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [271]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [281]  TRUE

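The one remaining mismatch, at column 263, appears to be the typicalday/typical_day naming difference seen earlier. Future versions of the workflow will need to handle these discrepancies more elegantly.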

Option 1: Fix the underlying forms.

Option 2: Add the ‘missing’ columns as NA in post-processing.

For now, I’m going to create functions that align these data frames. These are incorporated into R/functions.R so we do not source them again here.

#' Drop the `technology_use_scale` column(s)
#'
#' Removes every column whose name contains the literal text
#' "technology_use_scale" (matched case-insensitively, the default for
#' `contains()`).
#'
#' @param df A data frame.
#' @return `df` without the matching columns; rows are untouched.
remove_technology_use_scale <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("technology_use_scale"))
}

#' Drop the `doctor_told_you` column(s)
#'
#' Removes every column whose name contains the literal text
#' "doctor_told_you" (matched case-insensitively, the default for
#' `contains()`).
#'
#' @param df A data frame.
#' @return `df` without the matching columns; rows are untouched.
remove_doctor_told_you <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("doctor_told_you"))
}

#' Drop all Databrary-related columns
#'
#' Removes every column whose name contains the literal text
#' "group_databrary" (matched case-insensitively, the default for
#' `contains()`).
#'
#' @param df A data frame.
#' @return `df` without the matching columns; rows are untouched.
remove_databrary_fields <- function(df) {
  df |>
    dplyr::select(-dplyr::contains("group_databrary"))
}

#' Harmonize the spelling of `typicalday` in column names
#'
#' Rewrites every occurrence of "typicalday" in the column names as
#' "typical_day". Names that already use "typical_day" are left alone.
#'
#' @param df A data frame.
#' @return `df` with the renamed columns; the data are untouched.
reconcile_typicalday <- function(df) {
  harmonized <- stringr::str_replace_all(
    string = names(df),
    pattern = 'typicalday',
    replacement = 'typical_day'
  )
  stats::setNames(df, harmonized)
}

#' Remove a misplaced `group_locomotor_milestones.` label from column names
#'
#' In some forms the `group_locomotor_milestones` label also wraps fields
#' from later sections of the questionnaire. For column names in which
#' "locomotor_milestones" is followed by one of the section keywords
#' (health, division, rothbart, mediause, pets, typical, acknowledge), the
#' first "group_locomotor_milestones." label is stripped. All other names are
#' left unchanged.
#'
#' @param df A data frame.
#' @return `df` with the affected columns renamed; the data are untouched.
remove_permissive_locomotor_milestones_label <- function(df) {
  col_names <- names(df)

  # The alternation must be grouped. Ungrouped, `locomotor_milestones.*`
  # applies only to `health`, and every other keyword would match anywhere in
  # a name, with or without a preceding locomotor label.
  misplaced_pattern <- paste0(
    "locomotor_milestones.*",
    "(health|division|rothbart|mediause|pets|typical|acknowledge)"
  )
  is_misplaced <- stringr::str_detect(col_names, pattern = misplaced_pattern)

  col_names[is_misplaced] <-
    stringr::str_remove(col_names[is_misplaced], "group_locomotor_milestones\\.")
  names(df) <- col_names
  df
}

#' Drop `X_`-prefixed metadata columns and the `meta.instanceID` column
#'
#' Removes every column whose name contains "X_" (case-sensitive) or
#' "meta.instanceID".
#'
#' @param df A data frame.
#' @return `df` without the matching columns; rows are untouched.
remove_X_meta_cols <- function(df) {
  dplyr::select(
    df,
    # `contains()` ignores case by default, which would also drop any survey
    # column that merely contains a lower-case "x_" (e.g. "sex_of_child").
    -dplyr::contains("X_", ignore.case = FALSE),
    -dplyr::contains("meta.instanceID")
  )
}

#' Strip the redundant top-level group labels from column names
#'
#' Removes every occurrence of "group_homevisitquestionnaires." and then of
#' "group_combinedquestionnaires." from the column names.
#'
#' @param df A data frame.
#' @return `df` with the shortened column names; the data are untouched.
remove_redundant_group_labels <- function(df) {
  redundant_labels <- c(
    'group_homevisitquestionnaires\\.',
    'group_combinedquestionnaires\\.'
  )
  shortened <- names(df)
  # Labels are removed one after the other, in this order.
  for (label in redundant_labels) {
    shortened <- stringr::str_remove_all(shortened, label)
  }
  names(df) <- shortened
  df
}

#' Apply the full column-cleaning sequence to a home visit data frame
#'
#' Runs each cleaning helper in a fixed order, feeding the result of one step
#' into the next.
#'
#' @param df A data frame.
#' @return The data frame returned by the last cleaning step.
clean_dfs <- function(df) {
  cleaning_steps <- list(
    reconcile_typicalday,
    remove_technology_use_scale,
    remove_doctor_told_you,
    remove_permissive_locomotor_milestones_label,
    remove_databrary_fields,
    remove_X_meta_cols,
    remove_redundant_group_labels
  )
  # Fold left to right: each step receives the output of the previous one.
  Reduce(function(current, step) step(current), cleaning_steps, df)
}

Let’s test this workflow with the unmodified files.

# Reload the three unmodified merged data frames from the targets store.
targets::tar_load(df_merge_287_cols_1, store = "../_targets")
targets::tar_load(df_merge_287_cols_2, store = "../_targets")
targets::tar_load(df_merge_288_cols, store = "../_targets")

# Clean the first data frame and report its dimensions.
df1m <- clean_dfs(df = df_merge_287_cols_1)
dim(df1m)
## [1] 235 272
df2m <- clean_dfs(df_merge_287_cols_2)
dim(df2m)
## [1]  55 272
df3m <- clean_dfs(df_merge_288_cols)
dim(df3m)
## [1] 476 272
(names(df1m) == names(df2m)) |> sum()
## [1] 272
(names(df1m) == names(df3m)) |> sum()
## [1] 272

Merging files at last

# Stack the three cleaned data frames row-wise. rbind() needs matching column
# names, which the name comparisons above confirmed.
df <- rbind(df1m, df2m, df3m)

As of 2022-12-15, the above has now been incorporated into R/functions.R and into the _targets.R workflow.

Merge with Databrary info

For each session (row) in the merged data frame, we pull data from the associated Databrary volume and session. These data are merged with that drawn from KBT.

# Target `home_visit_w_databrary_df`: the result of passing `home_visit_df`
# to add_databrary_info_to_home_visit_df().
tar_target(
  home_visit_w_databrary_df,
  add_databrary_info_to_home_visit_df(home_visit_df)
)

The add_databrary_info_to_home_visit_df() function in R/functions.R does most of the work.

Clean and prepare for export