Rearranging data and selection of variables

Hi.
I have data of students with various parameters. The student names are mentioned initially (with variable name "cal_name") and the responses given (as 0 or 1) in for parameters (given as q_1_1, q_1_2,.., etc.). q_1_1 till q_1_11 are the parameters for student1, q_2_1 till q_2_11 are for student2 and so on.
I have done written a code but this doesn't work for me as the final file is not as per my requirement. I need the data in the vertical format with the student names/ids and their responses to the parameters given. But here a large number of duplicates are formed. Can I solve this issue?

library(tidyverse)

data1<-tibble::tribble(
       ~schoolid,     ~teacherid,                  ~cal_name1,                  ~cal_name2,                         ~cal_name3,               ~cal_name4,              ~cal_name5,          ~q_1, ~q_1_1, ~q_1_2, ~q_1_3, ~q_1_4, ~q_1_5, ~q_1_6, ~q_1_7, ~q_1_8, ~q_1_9, ~q_1_10, ~q_1_11,   ~q_2, ~q_2_1, ~q_2_2, ~q_2_3, ~q_2_4, ~q_2_5, ~q_2_6, ~q_2_7, ~q_2_8, ~q_2_9, ~q_2_10, ~q_2_11,       ~q_3, ~q_3_1, ~q_3_2, ~q_3_3, ~q_3_4, ~q_3_5, ~q_3_6, ~q_3_7, ~q_3_8, ~q_3_9, ~q_3_10, ~q_3_11,  ~q_4, ~q_4_1, ~q_4_2, ~q_4_3, ~q_4_4, ~q_4_5, ~q_4_6, ~q_4_7, ~q_4_8, ~q_4_9, ~q_4_10, ~q_4_11,       ~q_5, ~q_5_1, ~q_5_2, ~q_5_3, ~q_5_4, ~q_5_5, ~q_5_6, ~q_5_7, ~q_5_8, ~q_5_9, ~q_5_10, ~q_5_11,
  "DPEP Kusugal",    "Manjula A",     "Anand Byahatti-9AHCMX", "Apsana. S Hubballi-R0UQTS", "Fathima Begum.R.Kawalikai-ABUFRY", "Jyothi Byahatti-7I6QJM", "Momadsad Huggi-TFN2GP",        "1 10",     1L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      1L,      0L,   "10",     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      1L,      0L,     "4 10",     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,      1L,      0L, "5 8",     0L,     0L,     0L,     0L,     1L,     0L,     0L,     1L,     0L,      0L,      0L,        "4",     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,      0L,      0L,
  "GHPS Giriyal", "B.M.Hosamani", "Netra S. Kattimani-FAU4EG",     "Preeti Salagar-ALWZO7",            "Shrikant Jadav-1LAF0L",                       NA,                      NA, "1 3 8 10 11",     1L,     0L,     1L,     0L,     0L,     0L,     0L,     1L,     0L,      1L,      1L, "7 11",     0L,     0L,     0L,     0L,     0L,     0L,     1L,     0L,     0L,      0L,      1L,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,
  "DPEP Kusugal",    "Manjula A",     "Anand Byahatti-9AHCMX", "Apsana. S Hubballi-R0UQTS", "Fathima Begum.R.Kawalikai-ABUFRY", "Jyothi Byahatti-7I6QJM", "Momadsad Huggi-TFN2GP",       "3 6 9",     0L,     0L,     1L,     0L,     0L,     1L,     0L,     0L,     1L,      0L,      0L,  "4 7",     0L,     0L,     0L,     1L,     0L,     0L,     1L,     0L,     0L,      0L,      0L,      "3 9",     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,     1L,      0L,      0L,   "5",     0L,     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,      0L,      0L,        "7",     0L,     0L,     0L,     0L,     0L,     0L,     1L,     0L,     0L,      0L,      0L,
  "GHPS Giriyal", "B.M.Hosamani", "Netra S. Kattimani-FAU4EG",     "Preeti Salagar-ALWZO7",            "Shrikant Jadav-1LAF0L",                       NA,                      NA,         "3 4",     0L,     0L,     1L,     1L,     0L,     0L,     0L,     0L,     0L,      0L,      0L,     NA,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      0L,      0L,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,
  "GHPS Giriyal", "B.M.Hosamani", "Netra S. Kattimani-FAU4EG",     "Preeti Salagar-ALWZO7",            "Shrikant Jadav-1LAF0L",                       NA,                      NA,            NA,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      0L,      0L,  "3 9",     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,     1L,      0L,      0L,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,         NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,
  "DPEP Kusugal",    "Manjula A",     "Anand Byahatti-9AHCMX", "Apsana. S Hubballi-R0UQTS", "Fathima Begum.R.Kawalikai-ABUFRY", "Jyothi Byahatti-7I6QJM", "Momadsad Huggi-TFN2GP",       "1 5 7",     1L,     0L,     0L,     0L,     1L,     0L,     1L,     0L,     0L,      0L,      0L,  "4 8",     0L,     0L,     0L,     1L,     0L,     0L,     0L,     1L,     0L,      0L,      0L, "4 6 8 10",     0L,     0L,     0L,     1L,     0L,     1L,     0L,     1L,     0L,      1L,      0L, "5 6",     0L,     0L,     0L,     0L,     1L,     1L,     0L,     0L,     0L,      0L,      0L, "2 4 8 10",     0L,     1L,     0L,     1L,     0L,     0L,     0L,     1L,     0L,      1L,      0L
  )
Students <- data1 |> 
  select(schoolid, starts_with("cal_name"))
Students <- Students |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Name") |> 
  mutate(ID=str_remove(ID,"cal_name")) |> 
  filter(!is.na(Name))

Scores <- data1 |>
  select(schoolid,matches("q_\\d+_\\d+"))
Scores <- Scores |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Score") |> 
  mutate(Q=str_remove(ID,"_\\d+"),
         ID=str_extract(ID,"\\d+")) |> 
  filter(!is.na(Score)) |> 
  pivot_wider(names_from = "Q",values_from = "Score")
#> Warning: Values are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list` to suppress this warning.
#> * Use `values_fn = length` to identify where the duplicates arise
#> * Use `values_fn = {summary_fun}` to summarise duplicates
Final1 <- inner_join(Students,Scores,by="schoolid")

So it looks like R drops a warning when you run the pivot_wider statement. If you follow that warning with a little bit of modification...

image

Perhaps this is what you are looking for:

library(tidyverse)
library(dplyr)

Students <- data1 |> 
  select(schoolid, starts_with("cal_name")) |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Name") |> 
  mutate(ID=str_remove(ID,"cal_name")) |> 
  filter(!is.na(Name))

Scores <- data1 |>
  select(schoolid,matches("q_\\d+_\\d+")) |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Score") |> 
  mutate(Q=str_remove(ID,"_\\d+"),
         ID=str_extract(ID,"\\d+")) |> 
  filter(!is.na(Score))|> 
    dplyr::group_by(schoolid, ID, Q,Score) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    dplyr::filter(n > 1L) %>%
  pivot_wider(names_from = "Q",values_from = "Score")

Final1 <- inner_join(Students,Scores,by="schoolid")

When I run this code, many duplicates are created in the Final1 data frame.

Ah I think your original request was a little unclear. Your inner_join requires multiple keys to join on to reduce duplication (especially ID). Otherwise, schoolid and ID pairs will be duplicated multiplicatively.

After that you can do a group_by() on anything you don't need to aggregate and a max() to further summarize the results

Final1 <- inner_join(Students, Scores, by = c("schoolid" = "schoolid", "ID"="ID") ) %>% 
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%  #change NAs to 0s for numerica columns
  group_by(schoolid, ID, Name)%>%  
  summarise(across(everything(),max, na.rm=TRUE))

You should get something like this:

The question numbers are out of order, but you can fix that with a select or something like that.

Thanks a lot for this help. Sorry to ask again, but I am unable to get the desired result. In the following data frame "data1", I have in the schoolid variable 2 schools (one appearing once and the other appearing twice). When I run the code, it gives me only the students of one school. The other one is not there. Is it because of group_by?

library(tidyverse)
library(janitor)
#> 
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test
data1<-tibble::tribble(
              ~schoolid,      ~teacherid,                  ~cal_name1,                  ~cal_name2,                         ~cal_name3,               ~cal_name4,              ~cal_name5,                    ~cal_name6,               ~cal_name7, ~obs, ~q_1_1, ~q_1_2, ~q_1_3, ~q_1_4, ~q_1_5, ~q_1_6, ~q_1_7, ~q_1_8, ~q_1_9, ~q_1_10, ~q_1_11,   ~q_2, ~q_2_1, ~q_2_2, ~q_2_3, ~q_2_4, ~q_2_5, ~q_2_6, ~q_2_7, ~q_2_8, ~q_2_9, ~q_2_10, ~q_2_11,   ~q_3, ~q_3_1, ~q_3_2, ~q_3_3, ~q_3_4, ~q_3_5, ~q_3_6, ~q_3_7, ~q_3_8, ~q_3_9, ~q_3_10, ~q_3_11,  ~q_4, ~q_4_1, ~q_4_2, ~q_4_3, ~q_4_4, ~q_4_5, ~q_4_6, ~q_4_7, ~q_4_8, ~q_4_9, ~q_4_10, ~q_4_11, ~q_5, ~q_5_1, ~q_5_2, ~q_5_3, ~q_5_4, ~q_5_5, ~q_5_6, ~q_5_7, ~q_5_8, ~q_5_9, ~q_5_10, ~q_5_11, ~q_6, ~q_6_1, ~q_6_2, ~q_6_3, ~q_6_4, ~q_6_5, ~q_6_6, ~q_6_7, ~q_6_8, ~q_6_9, ~q_6_10, ~q_6_11,  ~q_7, ~q_7_1, ~q_7_2, ~q_7_3, ~q_7_4, ~q_7_5, ~q_7_6, ~q_7_7, ~q_7_8, ~q_7_9, ~q_7_10, ~q_7_11,
         "DPEP Kusugal",     "Manjula A",     "Anand Byahatti-9AHCMX", "Apsana. S Hubballi-R0UQTS", "Fathima Begum.R.Kawalikai-ABUFRY", "Jyothi Byahatti-7I6QJM", "Momadsad Huggi-TFN2GP", "Rajeshwari Dalayatha-AIF3A4", "Sanjana Talavar-V6YLZE",   1L,     1L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      1L,      0L,   "10",     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,      1L,      0L, "4 10",     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,      1L,      0L, "5 8",     0L,     0L,     0L,     0L,     1L,     0L,     0L,     1L,     0L,      0L,      0L,   4L,     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,      0L,      0L,   7L,     0L,     0L,     0L,     0L,     0L,     0L,     1L,     0L,     0L,      0L,      0L,   "9",     0L,     0L,     0L,     0L,     0L,     0L,     0L,     0L,     1L,      0L,      0L,
         "GHPS Giriyal",  "B.M.Hosamani", "Netra S. Kattimani-FAU4EG",     "Preeti Salagar-ALWZO7",            "Shrikant Jadav-1LAF0L",                       NA,                      NA,                            NA,                       NA,   2L,     1L,     0L,     1L,     0L,     0L,     0L,     0L,     1L,     0L,      1L,      1L, "7 11",     0L,     0L,     0L,     0L,     0L,     0L,     1L,     0L,     0L,      0L,      1L,    "5",     0L,     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,      0L,      0L,    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,   NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,   NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,      NA,      NA,
         "DPEP Kusugal", "Shobha Ujjain",     "Anand Byahatti-9AHCMX", "Apsana. S Hubballi-R0UQTS", "Fathima Begum.R.Kawalikai-ABUFRY", "Jyothi Byahatti-7I6QJM", "Momadsad Huggi-TFN2GP", "Rajeshwari Dalayatha-AIF3A4", "Sanjana Talavar-V6YLZE",   2L,     0L,     0L,     1L,     0L,     0L,     1L,     0L,     0L,     1L,      0L,      0L,  "4 7",     0L,     0L,     0L,     1L,     0L,     0L,     1L,     0L,     0L,      0L,      0L,  "3 9",     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,     1L,      0L,      0L,   "5",     0L,     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,      0L,      0L,   7L,     0L,     0L,     0L,     0L,     0L,     0L,     1L,     0L,     0L,      0L,      0L,   4L,     0L,     0L,     0L,     1L,     0L,     0L,     0L,     0L,     0L,      0L,      0L, "4 5",     0L,     0L,     0L,     1L,     1L,     0L,     0L,     0L,     0L,      0L,      0L
         )




Students <- data1 |> 
  select(schoolid, starts_with("cal_name")) |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Name") |> 
  mutate(ID=str_remove(ID,"cal_name")) |> 
  filter(!is.na(Name))

Scores <- data1 |>
  select(schoolid,matches("q_\\d+_\\d+")) |> 
  pivot_longer(cols = -schoolid,names_to = "ID",values_to = "Score") |> 
  mutate(Q=str_remove(ID,"_\\d+"),
         ID=str_extract(ID,"\\d+")) |> 
  filter(!is.na(Score))|> 
  dplyr::group_by(schoolid, ID, Q,Score) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L) %>%
  pivot_wider(names_from = "Q",values_from = "Score")

Final1 <- inner_join(Students, Scores, by = c("schoolid" = "schoolid", "ID"="ID") ) %>% 
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%  #change NAs to 0s for numerical columns
  group_by(schoolid, ID, Name)%>%  
  summarise(across(everything(),max, na.rm=TRUE))
#> `summarise()` has grouped output by 'schoolid', 'ID'. You can override using the `.groups` argument.
Created on 2022-06-20 by the reprex package (v2.0.1)

Ohh sorry, I think I need to filter(n>0). Isn't that right?