Vertical data merger with different columns.

I have 3 datasets with me in different files. These are mainly on the academic scores of students of different grades. For our research purposes, we need to merge these files vertically. The similar questions across the 3 levels will appear vertically one above the other. For example, the l1c1_score, l2c1_score and l3c1_score will be grouped together and then first grade students will appear first, then the second and the third grade will appear last. To again make it clear, the merger is vertical. The challenge I am facing is due to the difference in variable names and the number of columns is different for different levels.

library(tidyverse)

data_lvl1<-tibble::tribble(
  ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
       "S001",          2L,          2L,          2L,          1L,            7L,
       "S002",          2L,          1L,          2L,          2L,            7L,
       "S003",          4L,          1L,          2L,          3L,           10L,
       "S004",          6L,          1L,          1L,          2L,           10L,
       "S005",          2L,          1L,          1L,          3L,            7L,
       "S006",          2L,          2L,          1L,          3L,            8L,
       "S007",          2L,          2L,          1L,          4L,            9L,
       "S008",          2L,          2L,          1L,          2L,            7L,
       "S009",          2L,          2L,          2L,          2L,            8L,
       "S010",          4L,          1L,          2L,          4L,           11L,
       "S011",          4L,          1L,          3L,          4L,           12L,
       "S012",          4L,          1L,          4L,          2L,           11L,
       "S013",          4L,          1L,          4L,          2L,           11L,
       "S014",          3L,          1L,          4L,          4L,           12L,
       "S015",          2L,          1L,          1L,          2L,            6L
  )

data_lvl2<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
       "S016",          2L,          2L,          1L,          2L,          2L,            9L,
       "S017",          2L,          2L,          2L,          2L,          1L,            9L,
       "S018",          2L,          2L,          2L,          2L,          1L,            9L,
       "S019",          2L,          4L,          3L,          2L,          1L,           12L,
       "S020",          4L,          4L,          3L,          2L,          1L,           14L,
       "S021",          4L,          4L,          1L,          2L,          1L,           12L,
       "S022",          4L,          2L,          1L,          2L,          2L,           11L,
       "S023",          5L,          2L,          1L,          3L,          2L,           13L,
       "S024",          5L,          2L,          1L,          3L,          2L,           13L,
       "S025",          2L,          4L,          2L,          3L,          1L,           12L,
       "S026",          5L,          4L,          3L,          4L,          2L,           18L,
       "S027",          5L,          4L,          3L,          4L,          1L,           17L,
       "S028",          1L,          1L,          1L,          4L,          1L,            8L,
       "S029",          1L,          1L,          1L,          2L,          3L,            8L,
       "S030",          1L,          1L,          1L,          2L,          3L,            8L
  )

data_lvl3<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~l2c6_score, ~cog_total_l1,
       "S031",          4L,          1L,          3L,          1L,          3L,          3L,           15L,
       "S032",          4L,          3L,          2L,          6L,          0L,          4L,           19L,
       "S033",          4L,          4L,          0L,          6L,          5L,          4L,           23L,
       "S034",          4L,          1L,          0L,          6L,          4L,          2L,           17L,
       "S035",          4L,          6L,          2L,          3L,          5L,          2L,           22L,
       "S036",          2L,          0L,          5L,          2L,          1L,          2L,           12L,
       "S037",          2L,          5L,          4L,          3L,          4L,          2L,           20L,
       "S038",          2L,          2L,          6L,          3L,          3L,          2L,           18L,
       "S039",          2L,          5L,          0L,          4L,          0L,          2L,           13L,
       "S040",          6L,          3L,          0L,          5L,          2L,          2L,           18L,
       "S041",          6L,          6L,          0L,          5L,          1L,          2L,           20L,
       "S042",          6L,          0L,          6L,          4L,          3L,          2L,           21L,
       "S043",          6L,          6L,          3L,          4L,          2L,          2L,           23L,
       "S044",          2L,          0L,          6L,          5L,          1L,          4L,           18L,
       "S045",          2L,          5L,          0L,          0L,          6L,          4L,           17L
  )
Created on 2022-03-31 by the reprex package (v2.0.1)

If I'm understanding your problem correctly, you probably want to:

  1. Rename the columns in l2c1_score to match data_lvl1
  2. Create the column l2c5_score in data_lvl1 with NA values
  3. Use rbind() on data_lvl1 with data_lvl2

To make the code simplest to understand:

Step 1:

l2c1_renamed = l2c1_score |>
  rename(l1c1_score = l2c1_score,
         l1c2_score = l2c2_score,
         ...)

Step 2:

data_lvl1_mutated = data_lvl1 |>
  mutate(l2c5_score = NA)

Step 3:

data_lvl1_mutated |>
  rbind(l2c1_renamed)

Then repeat for the third dataframe

Hi. Thanks a lot. You understood my problem precisely. The actual dataset is very big with large number of columns. So as per my requirement, I will have to introduce NAs and merge.

The code does not seem to work as intended. Or am I making a mistake?

library(tidyverse)

data_lvl1<-tibble::tribble(
  ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
  "S001",          2L,          2L,          2L,          1L,            7L,
  "S002",          2L,          1L,          2L,          2L,            7L,
  "S003",          4L,          1L,          2L,          3L,           10L,
  "S004",          6L,          1L,          1L,          2L,           10L,
  "S005",          2L,          1L,          1L,          3L,            7L,
  "S006",          2L,          2L,          1L,          3L,            8L,
  "S007",          2L,          2L,          1L,          4L,            9L,
  "S008",          2L,          2L,          1L,          2L,            7L,
  "S009",          2L,          2L,          2L,          2L,            8L,
  "S010",          4L,          1L,          2L,          4L,           11L,
  "S011",          4L,          1L,          3L,          4L,           12L,
  "S012",          4L,          1L,          4L,          2L,           11L,
  "S013",          4L,          1L,          4L,          2L,           11L,
  "S014",          3L,          1L,          4L,          4L,           12L,
  "S015",          2L,          1L,          1L,          2L,            6L
)

data_lvl2<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
  "S016",          2L,          2L,          1L,          2L,          2L,            9L,
  "S017",          2L,          2L,          2L,          2L,          1L,            9L,
  "S018",          2L,          2L,          2L,          2L,          1L,            9L,
  "S019",          2L,          4L,          3L,          2L,          1L,           12L,
  "S020",          4L,          4L,          3L,          2L,          1L,           14L,
  "S021",          4L,          4L,          1L,          2L,          1L,           12L,
  "S022",          4L,          2L,          1L,          2L,          2L,           11L,
  "S023",          5L,          2L,          1L,          3L,          2L,           13L,
  "S024",          5L,          2L,          1L,          3L,          2L,           13L,
  "S025",          2L,          4L,          2L,          3L,          1L,           12L,
  "S026",          5L,          4L,          3L,          4L,          2L,           18L,
  "S027",          5L,          4L,          3L,          4L,          1L,           17L,
  "S028",          1L,          1L,          1L,          4L,          1L,            8L,
  "S029",          1L,          1L,          1L,          2L,          3L,            8L,
  "S030",          1L,          1L,          1L,          2L,          3L,            8L
)

l2c1_renamed <-l2c1_score |>
  rename(l1c1_score = l2c1_score,
         l1c2_score = l2c2_score)
#> Error in rename(l2c1_score, l1c1_score = l2c1_score, l1c2_score = l2c2_score): object 'l2c1_score' not found
data_lvl1_mutated = data_lvl1 |>
  mutate(l2c5_score = NA)

data_lvl1_mutated |>
  rbind(l2c1_renamed)
#> Error in rbind(deparse.level, ...): object 'l2c1_renamed' not found
Created on 2022-03-31 by the reprex package (v2.0.1)

Oh oops
It should be
l2c1_renamed = data_lvl2 |>

If your data is large, you might want tidytable instead, and using rename.() instead.

The code is working now. Thanks a lot.. This was a great help.

Regards,
NP

This is another way that can handle all three data frames at once, and also, another approach that retrieves the data that is encoded into the column names, so you don't lose it, and reshapes your data into a long format so you can more easily work with it.

library(tidyverse)

data_lvl1<-tibble::tribble(
    ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
    "S001",          2L,          2L,          2L,          1L,            7L,
    "S002",          2L,          1L,          2L,          2L,            7L,
    "S003",          4L,          1L,          2L,          3L,           10L,
    "S004",          6L,          1L,          1L,          2L,           10L,
    "S005",          2L,          1L,          1L,          3L,            7L,
    "S006",          2L,          2L,          1L,          3L,            8L,
    "S007",          2L,          2L,          1L,          4L,            9L,
    "S008",          2L,          2L,          1L,          2L,            7L,
    "S009",          2L,          2L,          2L,          2L,            8L,
    "S010",          4L,          1L,          2L,          4L,           11L,
    "S011",          4L,          1L,          3L,          4L,           12L,
    "S012",          4L,          1L,          4L,          2L,           11L,
    "S013",          4L,          1L,          4L,          2L,           11L,
    "S014",          3L,          1L,          4L,          4L,           12L,
    "S015",          2L,          1L,          1L,          2L,            6L
)

data_lvl2<-tibble::tribble(
    ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
    "S016",          2L,          2L,          1L,          2L,          2L,            9L,
    "S017",          2L,          2L,          2L,          2L,          1L,            9L,
    "S018",          2L,          2L,          2L,          2L,          1L,            9L,
    "S019",          2L,          4L,          3L,          2L,          1L,           12L,
    "S020",          4L,          4L,          3L,          2L,          1L,           14L,
    "S021",          4L,          4L,          1L,          2L,          1L,           12L,
    "S022",          4L,          2L,          1L,          2L,          2L,           11L,
    "S023",          5L,          2L,          1L,          3L,          2L,           13L,
    "S024",          5L,          2L,          1L,          3L,          2L,           13L,
    "S025",          2L,          4L,          2L,          3L,          1L,           12L,
    "S026",          5L,          4L,          3L,          4L,          2L,           18L,
    "S027",          5L,          4L,          3L,          4L,          1L,           17L,
    "S028",          1L,          1L,          1L,          4L,          1L,            8L,
    "S029",          1L,          1L,          1L,          2L,          3L,            8L,
    "S030",          1L,          1L,          1L,          2L,          3L,            8L
)

data_lvl3<-tibble::tribble(
    ~student_id, ~l3c1_score, ~l3c2_score, ~l3c3_score, ~l3c4_score, ~l3c5_score, ~l3c6_score, ~cog_total_l3,
    "S031",          4L,          1L,          3L,          1L,          3L,          3L,           15L,
    "S032",          4L,          3L,          2L,          6L,          0L,          4L,           19L,
    "S033",          4L,          4L,          0L,          6L,          5L,          4L,           23L,
    "S034",          4L,          1L,          0L,          6L,          4L,          2L,           17L,
    "S035",          4L,          6L,          2L,          3L,          5L,          2L,           22L,
    "S036",          2L,          0L,          5L,          2L,          1L,          2L,           12L,
    "S037",          2L,          5L,          4L,          3L,          4L,          2L,           20L,
    "S038",          2L,          2L,          6L,          3L,          3L,          2L,           18L,
    "S039",          2L,          5L,          0L,          4L,          0L,          2L,           13L,
    "S040",          6L,          3L,          0L,          5L,          2L,          2L,           18L,
    "S041",          6L,          6L,          0L,          5L,          1L,          2L,           20L,
    "S042",          6L,          0L,          6L,          4L,          3L,          2L,           21L,
    "S043",          6L,          6L,          3L,          4L,          2L,          2L,           23L,
    "S044",          2L,          0L,          6L,          5L,          1L,          4L,           18L,
    "S045",          2L,          5L,          0L,          0L,          6L,          4L,           17L
)

data_frame_list <- ls(pattern = "^data_lvl\\d$")

map_dfr(.x = data_frame_list,
        .f = ~{
            get(.x) %>% 
                set_names(str_remove(names(get(.x)), "_?l\\d"))
        })
#> # A tibble: 45 × 8
#>    student_id c1_score c2_score c3_score c4_score cog_total c5_score c6_score
#>    <chr>         <int>    <int>    <int>    <int>     <int>    <int>    <int>
#>  1 S001              2        2        2        1         7       NA       NA
#>  2 S002              2        1        2        2         7       NA       NA
#>  3 S003              4        1        2        3        10       NA       NA
#>  4 S004              6        1        1        2        10       NA       NA
#>  5 S005              2        1        1        3         7       NA       NA
#>  6 S006              2        2        1        3         8       NA       NA
#>  7 S007              2        2        1        4         9       NA       NA
#>  8 S008              2        2        1        2         7       NA       NA
#>  9 S009              2        2        2        2         8       NA       NA
#> 10 S010              4        1        2        4        11       NA       NA
#> # … with 35 more rows

# Retrive data from column names and reshape into long format
map_dfr(.x = data_frame_list,
        .f = ~{
            get(.x) %>%
                select(-starts_with("cog_total_")) %>% 
                pivot_longer(-student_id, names_to = c("level", "course"), names_pattern = "(l\\d)(c\\d)", values_to = "score")
        })
#> # A tibble: 225 × 4
#>    student_id level course score
#>    <chr>      <chr> <chr>  <int>
#>  1 S001       l1    c1         2
#>  2 S001       l1    c2         2
#>  3 S001       l1    c3         2
#>  4 S001       l1    c4         1
#>  5 S002       l1    c1         2
#>  6 S002       l1    c2         1
#>  7 S002       l1    c3         2
#>  8 S002       l1    c4         2
#>  9 S003       l1    c1         4
#> 10 S003       l1    c2         1
#> # … with 215 more rows

Created on 2022-03-31 by the reprex package (v2.0.1)

this code is far efficient. But with my dataset, I don't think this will be the apt solution. One major problem is the question number coding. For level-1, there will be one question l1c5 which will have to be grouped with l2c4 and l3c4. this situation comes again and again in our survey.

Then I would go with the second option I gave you since regrouping would be way easier and it gives you the opportunity to retain all the original data (including data encoded in the column names) for future use.

ok thanks a lot for this

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.