summarizing by group and linking the total group n with individual id

cactus · September 1, 2021, 9:21pm

I have a dataframe dfcolor.
subjectid is the unique id of each individual and grpmonth1:grpmonth5 are columns showing where each individual are assigned each month (total 5 month) across three group (RED, GREEN, BLUE)

dfcolor <- data.frame(
  subjectid = c(1,2,3,4,5,6,7,8,9,10,11),
  grpmonth1 = c("RED", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "GREEN", "BLUE", "GREEN", "BLUE"),
  grpmonth2 = c("RED", "RED", "GREEN", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth3 = c("GREEN", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "BLUE", "RED", "GREEN", "BLUE"),
  grpmonth4 = c("BLUE", "BLUE", "GREEN", "RED", "GREEN", "RED", "RED", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth5 = c("RED", "BLUE", "RED", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "BLUE", "BLUE"))
dfcolor
#>    subjectid grpmonth1 grpmonth2 grpmonth3 grpmonth4 grpmonth5
#> 1          1       RED       RED     GREEN      BLUE       RED
#> 2          2      BLUE       RED      BLUE      BLUE      BLUE
#> 3          3     GREEN     GREEN     GREEN     GREEN       RED
#> 4          4       RED       RED       RED       RED       RED
#> 5          5     GREEN       RED     GREEN     GREEN       RED
#> 6          6       RED       RED       RED       RED       RED
#> 7          7     GREEN     GREEN     GREEN       RED     GREEN
#> 8          8     GREEN     GREEN      BLUE     GREEN     GREEN
#> 9          9      BLUE       RED       RED       RED       RED
#> 10        10     GREEN     GREEN     GREEN     GREEN      BLUE
#> 11        11      BLUE      BLUE      BLUE      BLUE      BLUE
Created on 2021-09-01 by the reprex package (v2.0.1)

Now I need to create 5 set of columns showing how many individual were there in each month for the group that particular individual is in that particular month. Below I can see how many individuals were there in each group/each month manually.

totalmnth1 <- dfcolor %>% group_by(grpmonth1) %>% summarize(n())
totalmnth2 <- dfcolor %>% group_by(grpmonth2) %>% summarize(n())
totalmnth3 <- dfcolor %>% group_by(grpmonth3) %>% summarize(n())
totalmnth4 <- dfcolor %>% group_by(grpmonth4) %>% summarize(n())
totalmnth5 <- dfcolor %>% group_by(grpmonth5) %>% summarize(n())

But how do I link this with the main df i.e. dfcolor easily and build the five columns that I need that links each individual with number of group members in the group they are in each month?
Lets say for "subjectid" 1, he is in group "red" in month 1 and so his value for the new column ttlgrppl_m1 will be total people who were in group "red" during month 1.

A mock example I manually made just to illustrate how I want my final data to look like (NOTE: the numbers I kept there are random),

dfcolornew
#>    subjectid grpmonth1 grpmonth2 grpmonth3 grpmonth4 grpmonth5 ttlgrppl_m1
#> 1          1       RED       RED     GREEN      BLUE       RED           3
#> 2          2      BLUE       RED      BLUE      BLUE      BLUE           2
#> 3          3     GREEN     GREEN     GREEN     GREEN       RED           1
#> 4          4       RED       RED       RED       RED       RED           3
#> 5          5     GREEN       RED     GREEN     GREEN       RED           2
#> 6          6       RED       RED       RED       RED       RED           1
#> 7          7     GREEN     GREEN     GREEN       RED     GREEN           3
#> 8          8     GREEN     GREEN      BLUE     GREEN     GREEN           2
#> 9          9      BLUE       RED       RED       RED       RED           1
#> 10        10     GREEN     GREEN     GREEN     GREEN      BLUE           3
#> 11        11      BLUE      BLUE      BLUE      BLUE      BLUE           2
#>    ttlgrppl_m2 ttlgrppl_m3 ttlgrppl_m4 ttlgrppl_m5
#> 1            3           3           3           3
#> 2            3           1           2           2
#> 3            1           1           1           1
#> 4            3           2           3           2
#> 5            2           2           2           2
#> 6            1           3           2           1
#> 7            3           3           3           1
#> 8            2           2           1           2
#> 9            1           1           1           1
#> 10           3           3           3           3
#> 11           2           2           2           2
Created on 2021-09-01 by the reprex package (v2.0.1)

P.S I need to work with huge number of group values so I need to use a range. E.g my group can be from group 1:5000. If you could also include that in your suggestions, that will be awesome !
Thank you R community !

HanOostdijk · September 1, 2021, 10:21pm

Is this what you want to do :

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)

dfcolor <- data.frame(
  subjectid = c(1,2,3,4,5,6,7,8,9,10,11),
  grpmonth1 = c("RED", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "GREEN", "BLUE", "GREEN", "BLUE"),
  grpmonth2 = c("RED", "RED", "GREEN", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth3 = c("GREEN", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "BLUE", "RED", "GREEN", "BLUE"),
  grpmonth4 = c("BLUE", "BLUE", "GREEN", "RED", "GREEN", "RED", "RED", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth5 = c("RED", "BLUE", "RED", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "BLUE", "BLUE"))

dfcolor_long <- dfcolor %>%
  tidyr::pivot_longer(names_to = 'grpmonth', values_to = 'color',cols=grpmonth1:grpmonth5)

dfcolor_long <- dfcolor_long %>%
  group_by(grpmonth,color) %>%
  mutate(n=n())

dfcolor_wide <- dfcolor_long %>%
  tidyr::pivot_wider(id_cols = subjectid,names_from=grpmonth,values_from = c(color,n))

print(as.data.frame(dfcolor_wide))
#>    subjectid color_grpmonth1 color_grpmonth2 color_grpmonth3 color_grpmonth4
#> 1          1             RED             RED           GREEN            BLUE
#> 2          2            BLUE             RED            BLUE            BLUE
#> 3          3           GREEN           GREEN           GREEN           GREEN
#> 4          4             RED             RED             RED             RED
#> 5          5           GREEN             RED           GREEN           GREEN
#> 6          6             RED             RED             RED             RED
#> 7          7           GREEN           GREEN           GREEN             RED
#> 8          8           GREEN           GREEN            BLUE           GREEN
#> 9          9            BLUE             RED             RED             RED
#> 10        10           GREEN           GREEN           GREEN           GREEN
#> 11        11            BLUE            BLUE            BLUE            BLUE
#>    color_grpmonth5 n_grpmonth1 n_grpmonth2 n_grpmonth3 n_grpmonth4 n_grpmonth5
#> 1              RED           3           6           5           3           6
#> 2             BLUE           3           6           3           3           3
#> 3              RED           5           4           5           4           6
#> 4              RED           3           6           3           4           6
#> 5              RED           5           6           5           4           6
#> 6              RED           3           6           3           4           6
#> 7            GREEN           5           4           5           4           2
#> 8            GREEN           5           4           3           4           2
#> 9              RED           3           6           3           4           6
#> 10            BLUE           5           4           5           4           3
#> 11            BLUE           3           1           3           3           3
Created on 2021-09-02 by the reprex package (v2.0.0)

cactus · September 2, 2021, 4:43pm

Thank you !
This is such a cool way of doing it. I was able to do this in the mock dataset.
However, I tried to replicate this into my real dataset and the output got too big to handle. This is the error I got.

dfcolor_long <- dfcolor %>%
  tidyr::pivot_longer(names_to = 'grpmonth', values_to = 'color',cols=grpmonth1:grpmonth250)

`Error: cannot allocate vector of size 112.2 Mb`

For the context, my data has around 12000 subjects (i.e. subject id in the mock dataset I put up) and has month extending from month 1: month 250. Any idea on making this work in such large dataset?
Thanks a lot again @HanOostdijk !

HanOostdijk · September 2, 2021, 5:39pm

I am sorry to hear that. Avoiding memory problems is a field on its own!

I see two possible ways to avoid this:

if your values are indeed characters (as in your mock example) convert them to factors. These take less memory than character strings.
you could limit the number of columns that you handle in one go, because each column can be handled on its own. Afterwards you would have to cbind the results.

Maybe it would help if you could tell us where this data comes from and what is your 'final' goal (?)

clausp · September 2, 2021, 5:40pm

Take a look at memory.limit() if you are on Windows and set the appropriate size.

If you have a Mac and run into the problem "vector memory exhausted (limit reached?)" and have R 3.5 or later, you will need to explicitly set the environment variable R_MAX_VSIZE. You do that by going to your home directory (~), create the file .Renviron, and add something like R_MAX_VSIZE=100Gb in the first line.
Restart RStudio and it should work, although you might have to change the size of the allocation. More information in available on Stack Overflow.

cactus · September 2, 2021, 6:07pm

Just checked and they are factors but the 2nd option sounds feasible (though tedious) ! haha
The data is not public and on a secure server so I don't have lot of flexibility modifying the system. Right now I am just trying to create new variables to play with different models.

Let me try to do the 2nd option and once again thank you for the support

cactus · September 2, 2021, 6:08pm

Thank you !
Just checked and it is ~ 1.5 GB (16383)
Working on a mac but on remote desktop. Is it the same process?

clausp · September 2, 2021, 6:14pm

Should be (high among famous last words!). The exception is if your remote system is a RStudio Workbench (formerly Server Pro) in which case you should probably talk your to local friendly system admin.

HanOostdijk · September 2, 2021, 9:53pm

Hello @cactus ,
you could also try this :

dfcolor <- data.frame(
  subjectid = c(1,2,3,4,5,6,7,8,9,10,11),
  grpmonth1 = c("RED", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "GREEN", "BLUE", "GREEN", "BLUE"),
  grpmonth2 = c("RED", "RED", "GREEN", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth3 = c("GREEN", "BLUE", "GREEN", "RED", "GREEN", "RED", "GREEN", "BLUE", "RED", "GREEN", "BLUE"),
  grpmonth4 = c("BLUE", "BLUE", "GREEN", "RED", "GREEN", "RED", "RED", "GREEN", "RED", "GREEN", "BLUE"),
  grpmonth5 = c("RED", "BLUE", "RED", "RED", "RED", "RED", "GREEN", "GREEN", "RED", "BLUE", "BLUE"),
  stringsAsFactors = T)

grps <- setdiff(names(dfcolor),c("subjectid"))

for (grp in grps) {
  counts <- table(dfcolor[grp])
  n <- as.numeric(counts[ unlist(dfcolor[grp])])
  dfcolor[paste("n",grp,sep="_")] <- n
}

dfcolor
#>    subjectid grpmonth1 grpmonth2 grpmonth3 grpmonth4 grpmonth5 n_grpmonth1
#> 1          1       RED       RED     GREEN      BLUE       RED           3
#> 2          2      BLUE       RED      BLUE      BLUE      BLUE           3
#> 3          3     GREEN     GREEN     GREEN     GREEN       RED           5
#> 4          4       RED       RED       RED       RED       RED           3
#> 5          5     GREEN       RED     GREEN     GREEN       RED           5
#> 6          6       RED       RED       RED       RED       RED           3
#> 7          7     GREEN     GREEN     GREEN       RED     GREEN           5
#> 8          8     GREEN     GREEN      BLUE     GREEN     GREEN           5
#> 9          9      BLUE       RED       RED       RED       RED           3
#> 10        10     GREEN     GREEN     GREEN     GREEN      BLUE           5
#> 11        11      BLUE      BLUE      BLUE      BLUE      BLUE           3
#>    n_grpmonth2 n_grpmonth3 n_grpmonth4 n_grpmonth5
#> 1            6           5           3           6
#> 2            6           3           3           3
#> 3            4           5           4           6
#> 4            6           3           4           6
#> 5            6           5           4           6
#> 6            6           3           4           6
#> 7            4           5           4           2
#> 8            4           3           4           2
#> 9            6           3           4           6
#> 10           4           5           4           3
#> 11           1           3           3           3
Created on 2021-09-02 by the reprex package (v2.0.0)

cactus · September 4, 2021, 6:34pm

This worked perfectly. I had to delete all the columns except those that I wanted to work in to avoid the continuous errors. But once that was done I was able to do the calculation, create a new file and merge it back to the new file.
For someone who just started R, it took sometime to understand the codes but it was a nice learning Thanks @HanOostdijk

system · September 11, 2021, 6:35pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.