Vertical data merger with different columns.

kuttan98 · March 31, 2022, 4:54am

I have 3 datasets with me in different files. These are mainly on the academic scores of students of different grades. For our research purposes, we need to merge these files vertically. The similar questions across the 3 levels will appear vertically one above the other. For example, the l1c1_score, l2c1_score and l3c1_score will be grouped together and then first grade students will appear first, then the second and the third grade will appear last. To again make it clear, the merger is vertical. The challenge I am facing is due to the difference in variable names and the number of columns is different for different levels.

library(tidyverse)

data_lvl1<-tibble::tribble(
  ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
       "S001",          2L,          2L,          2L,          1L,            7L,
       "S002",          2L,          1L,          2L,          2L,            7L,
       "S003",          4L,          1L,          2L,          3L,           10L,
       "S004",          6L,          1L,          1L,          2L,           10L,
       "S005",          2L,          1L,          1L,          3L,            7L,
       "S006",          2L,          2L,          1L,          3L,            8L,
       "S007",          2L,          2L,          1L,          4L,            9L,
       "S008",          2L,          2L,          1L,          2L,            7L,
       "S009",          2L,          2L,          2L,          2L,            8L,
       "S010",          4L,          1L,          2L,          4L,           11L,
       "S011",          4L,          1L,          3L,          4L,           12L,
       "S012",          4L,          1L,          4L,          2L,           11L,
       "S013",          4L,          1L,          4L,          2L,           11L,
       "S014",          3L,          1L,          4L,          4L,           12L,
       "S015",          2L,          1L,          1L,          2L,            6L
  )

data_lvl2<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
       "S016",          2L,          2L,          1L,          2L,          2L,            9L,
       "S017",          2L,          2L,          2L,          2L,          1L,            9L,
       "S018",          2L,          2L,          2L,          2L,          1L,            9L,
       "S019",          2L,          4L,          3L,          2L,          1L,           12L,
       "S020",          4L,          4L,          3L,          2L,          1L,           14L,
       "S021",          4L,          4L,          1L,          2L,          1L,           12L,
       "S022",          4L,          2L,          1L,          2L,          2L,           11L,
       "S023",          5L,          2L,          1L,          3L,          2L,           13L,
       "S024",          5L,          2L,          1L,          3L,          2L,           13L,
       "S025",          2L,          4L,          2L,          3L,          1L,           12L,
       "S026",          5L,          4L,          3L,          4L,          2L,           18L,
       "S027",          5L,          4L,          3L,          4L,          1L,           17L,
       "S028",          1L,          1L,          1L,          4L,          1L,            8L,
       "S029",          1L,          1L,          1L,          2L,          3L,            8L,
       "S030",          1L,          1L,          1L,          2L,          3L,            8L
  )

data_lvl3<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~l2c6_score, ~cog_total_l1,
       "S031",          4L,          1L,          3L,          1L,          3L,          3L,           15L,
       "S032",          4L,          3L,          2L,          6L,          0L,          4L,           19L,
       "S033",          4L,          4L,          0L,          6L,          5L,          4L,           23L,
       "S034",          4L,          1L,          0L,          6L,          4L,          2L,           17L,
       "S035",          4L,          6L,          2L,          3L,          5L,          2L,           22L,
       "S036",          2L,          0L,          5L,          2L,          1L,          2L,           12L,
       "S037",          2L,          5L,          4L,          3L,          4L,          2L,           20L,
       "S038",          2L,          2L,          6L,          3L,          3L,          2L,           18L,
       "S039",          2L,          5L,          0L,          4L,          0L,          2L,           13L,
       "S040",          6L,          3L,          0L,          5L,          2L,          2L,           18L,
       "S041",          6L,          6L,          0L,          5L,          1L,          2L,           20L,
       "S042",          6L,          0L,          6L,          4L,          3L,          2L,           21L,
       "S043",          6L,          6L,          3L,          4L,          2L,          2L,           23L,
       "S044",          2L,          0L,          6L,          5L,          1L,          4L,           18L,
       "S045",          2L,          5L,          0L,          0L,          6L,          4L,           17L
  )
Created on 2022-03-31 by the reprex package (v2.0.1)

pathos · March 31, 2022, 7:19am

If I'm understanding your problem correctly, you probably want to:

Rename the columns in l2c1_score to match data_lvl1
Create the column l2c5_score in data_lvl1 with NA values
Use rbind() on data_lvl1 with data_lvl2

To make the code simplest to understand:

Step 1:

l2c1_renamed = l2c1_score |>
  rename(l1c1_score = l2c1_score,
         l1c2_score = l2c2_score,
         ...)

Step 2:

data_lvl1_mutated = data_lvl1 |>
  mutate(l2c5_score = NA)

Step 3:

data_lvl1_mutated |>
  rbind(l2c1_renamed)

Then repeat for the third dataframe

kuttan98 · March 31, 2022, 11:09am

Hi. Thanks a lot. You understood my problem precisely. The actual dataset is very big with large number of columns. So as per my requirement, I will have to introduce NAs and merge.

kuttan98 · March 31, 2022, 12:25pm

The code does not seem to work as intended. Or am I making a mistake?

library(tidyverse)

data_lvl1<-tibble::tribble(
  ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
  "S001",          2L,          2L,          2L,          1L,            7L,
  "S002",          2L,          1L,          2L,          2L,            7L,
  "S003",          4L,          1L,          2L,          3L,           10L,
  "S004",          6L,          1L,          1L,          2L,           10L,
  "S005",          2L,          1L,          1L,          3L,            7L,
  "S006",          2L,          2L,          1L,          3L,            8L,
  "S007",          2L,          2L,          1L,          4L,            9L,
  "S008",          2L,          2L,          1L,          2L,            7L,
  "S009",          2L,          2L,          2L,          2L,            8L,
  "S010",          4L,          1L,          2L,          4L,           11L,
  "S011",          4L,          1L,          3L,          4L,           12L,
  "S012",          4L,          1L,          4L,          2L,           11L,
  "S013",          4L,          1L,          4L,          2L,           11L,
  "S014",          3L,          1L,          4L,          4L,           12L,
  "S015",          2L,          1L,          1L,          2L,            6L
)

data_lvl2<-tibble::tribble(
  ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
  "S016",          2L,          2L,          1L,          2L,          2L,            9L,
  "S017",          2L,          2L,          2L,          2L,          1L,            9L,
  "S018",          2L,          2L,          2L,          2L,          1L,            9L,
  "S019",          2L,          4L,          3L,          2L,          1L,           12L,
  "S020",          4L,          4L,          3L,          2L,          1L,           14L,
  "S021",          4L,          4L,          1L,          2L,          1L,           12L,
  "S022",          4L,          2L,          1L,          2L,          2L,           11L,
  "S023",          5L,          2L,          1L,          3L,          2L,           13L,
  "S024",          5L,          2L,          1L,          3L,          2L,           13L,
  "S025",          2L,          4L,          2L,          3L,          1L,           12L,
  "S026",          5L,          4L,          3L,          4L,          2L,           18L,
  "S027",          5L,          4L,          3L,          4L,          1L,           17L,
  "S028",          1L,          1L,          1L,          4L,          1L,            8L,
  "S029",          1L,          1L,          1L,          2L,          3L,            8L,
  "S030",          1L,          1L,          1L,          2L,          3L,            8L
)

l2c1_renamed <-l2c1_score |>
  rename(l1c1_score = l2c1_score,
         l1c2_score = l2c2_score)
#> Error in rename(l2c1_score, l1c1_score = l2c1_score, l1c2_score = l2c2_score): object 'l2c1_score' not found
data_lvl1_mutated = data_lvl1 |>
  mutate(l2c5_score = NA)

data_lvl1_mutated |>
  rbind(l2c1_renamed)
#> Error in rbind(deparse.level, ...): object 'l2c1_renamed' not found
Created on 2022-03-31 by the reprex package (v2.0.1)

pathos · March 31, 2022, 2:19pm

Oh oops
It should be
l2c1_renamed = data_lvl2 |>

If your data is large, you might want tidytable instead, and using rename.() instead.

kuttan98 · March 31, 2022, 3:10pm

The code is working now. Thanks a lot.. This was a great help.

Regards,
NP

andresrcs · March 31, 2022, 9:44pm

This is another way that can handle all three data frames at once, and also, another approach that retrieves the data that is encoded into the column names, so you don't lose it, and reshapes your data into a long format so you can more easily work with it.

library(tidyverse)

data_lvl1<-tibble::tribble(
    ~student_id, ~l1c1_score, ~l1c2_score, ~l1c3_score, ~l1c4_score, ~cog_total_l1,
    "S001",          2L,          2L,          2L,          1L,            7L,
    "S002",          2L,          1L,          2L,          2L,            7L,
    "S003",          4L,          1L,          2L,          3L,           10L,
    "S004",          6L,          1L,          1L,          2L,           10L,
    "S005",          2L,          1L,          1L,          3L,            7L,
    "S006",          2L,          2L,          1L,          3L,            8L,
    "S007",          2L,          2L,          1L,          4L,            9L,
    "S008",          2L,          2L,          1L,          2L,            7L,
    "S009",          2L,          2L,          2L,          2L,            8L,
    "S010",          4L,          1L,          2L,          4L,           11L,
    "S011",          4L,          1L,          3L,          4L,           12L,
    "S012",          4L,          1L,          4L,          2L,           11L,
    "S013",          4L,          1L,          4L,          2L,           11L,
    "S014",          3L,          1L,          4L,          4L,           12L,
    "S015",          2L,          1L,          1L,          2L,            6L
)

data_lvl2<-tibble::tribble(
    ~student_id, ~l2c1_score, ~l2c2_score, ~l2c3_score, ~l2c4_score, ~l2c5_score, ~cog_total_l2,
    "S016",          2L,          2L,          1L,          2L,          2L,            9L,
    "S017",          2L,          2L,          2L,          2L,          1L,            9L,
    "S018",          2L,          2L,          2L,          2L,          1L,            9L,
    "S019",          2L,          4L,          3L,          2L,          1L,           12L,
    "S020",          4L,          4L,          3L,          2L,          1L,           14L,
    "S021",          4L,          4L,          1L,          2L,          1L,           12L,
    "S022",          4L,          2L,          1L,          2L,          2L,           11L,
    "S023",          5L,          2L,          1L,          3L,          2L,           13L,
    "S024",          5L,          2L,          1L,          3L,          2L,           13L,
    "S025",          2L,          4L,          2L,          3L,          1L,           12L,
    "S026",          5L,          4L,          3L,          4L,          2L,           18L,
    "S027",          5L,          4L,          3L,          4L,          1L,           17L,
    "S028",          1L,          1L,          1L,          4L,          1L,            8L,
    "S029",          1L,          1L,          1L,          2L,          3L,            8L,
    "S030",          1L,          1L,          1L,          2L,          3L,            8L
)

data_lvl3<-tibble::tribble(
    ~student_id, ~l3c1_score, ~l3c2_score, ~l3c3_score, ~l3c4_score, ~l3c5_score, ~l3c6_score, ~cog_total_l3,
    "S031",          4L,          1L,          3L,          1L,          3L,          3L,           15L,
    "S032",          4L,          3L,          2L,          6L,          0L,          4L,           19L,
    "S033",          4L,          4L,          0L,          6L,          5L,          4L,           23L,
    "S034",          4L,          1L,          0L,          6L,          4L,          2L,           17L,
    "S035",          4L,          6L,          2L,          3L,          5L,          2L,           22L,
    "S036",          2L,          0L,          5L,          2L,          1L,          2L,           12L,
    "S037",          2L,          5L,          4L,          3L,          4L,          2L,           20L,
    "S038",          2L,          2L,          6L,          3L,          3L,          2L,           18L,
    "S039",          2L,          5L,          0L,          4L,          0L,          2L,           13L,
    "S040",          6L,          3L,          0L,          5L,          2L,          2L,           18L,
    "S041",          6L,          6L,          0L,          5L,          1L,          2L,           20L,
    "S042",          6L,          0L,          6L,          4L,          3L,          2L,           21L,
    "S043",          6L,          6L,          3L,          4L,          2L,          2L,           23L,
    "S044",          2L,          0L,          6L,          5L,          1L,          4L,           18L,
    "S045",          2L,          5L,          0L,          0L,          6L,          4L,           17L
)

data_frame_list <- ls(pattern = "^data_lvl\\d$")

map_dfr(.x = data_frame_list,
        .f = ~{
            get(.x) %>% 
                set_names(str_remove(names(get(.x)), "_?l\\d"))
        })
#> # A tibble: 45 × 8
#>    student_id c1_score c2_score c3_score c4_score cog_total c5_score c6_score
#>    <chr>         <int>    <int>    <int>    <int>     <int>    <int>    <int>
#>  1 S001              2        2        2        1         7       NA       NA
#>  2 S002              2        1        2        2         7       NA       NA
#>  3 S003              4        1        2        3        10       NA       NA
#>  4 S004              6        1        1        2        10       NA       NA
#>  5 S005              2        1        1        3         7       NA       NA
#>  6 S006              2        2        1        3         8       NA       NA
#>  7 S007              2        2        1        4         9       NA       NA
#>  8 S008              2        2        1        2         7       NA       NA
#>  9 S009              2        2        2        2         8       NA       NA
#> 10 S010              4        1        2        4        11       NA       NA
#> # … with 35 more rows

# Retrive data from column names and reshape into long format
map_dfr(.x = data_frame_list,
        .f = ~{
            get(.x) %>%
                select(-starts_with("cog_total_")) %>% 
                pivot_longer(-student_id, names_to = c("level", "course"), names_pattern = "(l\\d)(c\\d)", values_to = "score")
        })
#> # A tibble: 225 × 4
#>    student_id level course score
#>    <chr>      <chr> <chr>  <int>
#>  1 S001       l1    c1         2
#>  2 S001       l1    c2         2
#>  3 S001       l1    c3         2
#>  4 S001       l1    c4         1
#>  5 S002       l1    c1         2
#>  6 S002       l1    c2         1
#>  7 S002       l1    c3         2
#>  8 S002       l1    c4         2
#>  9 S003       l1    c1         4
#> 10 S003       l1    c2         1
#> # … with 215 more rows

^{Created on 2022-03-31 by the reprex package (v2.0.1)}

kuttan98 · April 1, 2022, 4:09am

this code is far efficient. But with my dataset, I don't think this will be the apt solution. One major problem is the question number coding. For level-1, there will be one question l1c5 which will have to be grouped with l2c4 and l3c4. this situation comes again and again in our survey.

andresrcs · April 1, 2022, 12:13pm

Then I would go with the second option I gave you since regrouping would be way easier and it gives you the opportunity to retain all the original data (including data encoded in the column names) for future use.

kuttan98 · April 5, 2022, 7:19am

ok thanks a lot for this

system · April 12, 2022, 7:20am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.