Creating stacked column grouped by calculated variable

Hi everyone,

I'm trying to find more streamlined ways of doing things but don't know how to:

  1. add up several columns ready for plotting into a bar graph;
  2. create a new variable using conditional logic across multiple columns;
  3. combine the results of 1 and 2 in a stacked bar graph/geom_bar (although, I think I might have this one and just want to check that I'm on the right track)

My sample dataframe is:

                    ID = c(1L, 2L, 3L, 4L, 5L, 6L),
             More.Info = c(1L, 1L, NA, 1L, NA, 1L),
  Better.Understanding = c(NA, 1L, 1L, 1L, NA, 1L),
            Reg.Reform = c(NA, 1L, 1L, NA, NA, 1L),
         More.capacity = c(NA, 1L, NA, NA, 1L, 1L),
                 Other = c(1L, NA, NA, NA, NA, NA),
               Group.A = c(1L, 3L, 2L, NA, 3L, 2L),
               Group.B = c(2L, 1L, NA, 2L, 2L, 2L),
               Group.C = c(1L, 1L, 3L, 1L, NA, NA),
               Group.D = c(1L, 1L, 2L, 1L, 3L, 1L),
               Group.E = c(2L, 3L, 3L, NA, NA, 3L),
               Group.F = c(3L, 3L, 3L, NA, 2L, 1L),
               Group.G = c(1L, 2L, 1L, 1L, 3L, 1L),
               Group.H = c(3L, 3L, 1L, 1L, 2L, 2L),
               Group.I = c(3L, 3L, NA, 1L, 1L, 3L),
               Group.J = c(3L, 2L, 2L, 3L, 2L, NA),
               Group.K = c(1L, 1L, 2L, 1L, 3L, 2L),
               Group.L = c(1L, 1L, 2L, 3L, NA, 3L),
               Group.M = c(1L, 3L, 3L, NA, 1L, 2L),
               Group.N = c(3L, 1L, NA, 3L, 3L, 1L),
               Group.O = c(3L, 3L, 2L, 2L, 2L, 3L)
                    )                    

The columns "More.Info", "Better.Understanding", "Reg.Reform", "More.capacity" and "Other" the types of assistance survey respondents selected. To plot this information, I would normally switch over to Excel and tally up each of these columns and create a manual dataframe e.g.

cols<-c("More.Info", "Better.Understanding", "Reg.Reform", "More.Capacity", "Other")
responses<-c(7, 6, 5, 4, 1)

df<-tibble(cols, responses)
head(df)

df %>%
  ggplot(aes(x = cols, y = responses))+
  geom_col()

Is there an easier way to do this in R? I have tried the following code (different iterations, not all one code) - none of them work:

tally<-sdat%>%
  summarise(count(Better.Understanding))%>%
  summarise(count(More.capacity))%>%
  summarise(count(More.Info))%>%
  summarise(count(Other))%>%
  summarise(count(Reg.Reform))

tally<-sdat%>%
  count(Better.Understanding)%>%
  count(More.capacity)%>%
  count(More.Info)%>%
  count(Other)%>%
  count(Reg.Reform)

tally<-sdat%>%
  summarise(sum(Better.Understanding))%>%
  summarise(sum(More.capacity))%>%
  summarise(sum(More.Info))%>%
  summarise(sum(Other))%>%
  summarise(sum(Reg.Reform))

tally<-sdat%>%
  cumsum(Better.Understanding)%>%
  cumsum(More.capacity)%>%
  cumsum(More.Info)%>%
  cumsum(Other)%>%
  cumsum(Reg.Reform)

I then want to calculate (from the remaining variables) the respondents level of engagement in professional organisations (Group A-0). To calculate a new column 'Engagement', a respondent would need to have a "1" (=Well Connected) or "2"(=Somewhat Connected) in any of the columns GroupA:GroupO.

I tried the following code, but it does not work:

df <- within(df, {
  Engagement <- NA
  Engagement["Group.A":"Group.O" < 3] <- "Engaged"
  Engagement["Group.A":"Group.O" >= 3] <- "Not Engaged"
})

Once I've calculated the Engagement variable, could I plot the types of assistance (i.e. "More.Info", "Better.Understanding", "Reg.Reform", "More.capacity" and "Other") along the x axis, grouped by "Engagement" using the following code:

df %>% 
  ggplot(aes(x = 2:6, y = tally, group = Engagement, fill = Engagement)) +
  geom_histogram(stat='identity', alpha = 0.4, width = 0.9)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

Apologies for the multiple questions - I thought it might be more efficient than multiple posts given it is all related.

Thank you, in advance, for any help or guidance you can provide.

Here is one method to get your first plot. All of the steps could have been chained together but I broke them out to make the process easier to follow.

library(tidyverse)

DF <- data.frame(ID = c(1L, 2L, 3L, 4L, 5L, 6L),
More.Info = c(1L, 1L, NA, 1L, NA, 1L),
Better.Understanding = c(NA, 1L, 1L, 1L, NA, 1L),
Reg.Reform = c(NA, 1L, 1L, NA, NA, 1L),
More.capacity = c(NA, 1L, NA, NA, 1L, 1L),
Other = c(1L, NA, NA, NA, NA, NA),
Group.A = c(1L, 3L, 2L, NA, 3L, 2L),
Group.B = c(2L, 1L, NA, 2L, 2L, 2L),
Group.C = c(1L, 1L, 3L, 1L, NA, NA),
Group.D = c(1L, 1L, 2L, 1L, 3L, 1L),
Group.E = c(2L, 3L, 3L, NA, NA, 3L),
Group.F = c(3L, 3L, 3L, NA, 2L, 1L),
Group.G = c(1L, 2L, 1L, 1L, 3L, 1L),
Group.H = c(3L, 3L, 1L, 1L, 2L, 2L),
Group.I = c(3L, 3L, NA, 1L, 1L, 3L),
Group.J = c(3L, 2L, 2L, 3L, 2L, NA),
Group.K = c(1L, 1L, 2L, 1L, 3L, 2L),
Group.L = c(1L, 1L, 2L, 3L, NA, 3L),
Group.M = c(1L, 3L, 3L, NA, 1L, 2L),
Group.N = c(3L, 1L, NA, 3L, 3L, 1L),
Group.O = c(3L, 3L, 2L, 2L, 2L, 3L)
)    


DF_Long <- DF |> select(ID:Other) |> 
  pivot_longer(More.Info:Other, names_to = "Type", values_to = "Value")
DF_Long
#> # A tibble: 30 × 3
#>       ID Type                 Value
#>    <int> <chr>                <int>
#>  1     1 More.Info                1
#>  2     1 Better.Understanding    NA
#>  3     1 Reg.Reform              NA
#>  4     1 More.capacity           NA
#>  5     1 Other                    1
#>  6     2 More.Info                1
#>  7     2 Better.Understanding     1
#>  8     2 Reg.Reform               1
#>  9     2 More.capacity            1
#> 10     2 Other                   NA
#> # … with 20 more rows
DF_Summ <- DF_Long |> group_by(Type) |> summarize(Count = sum(Value, na.rm = TRUE))
DF_Summ
#> # A tibble: 5 × 2
#>   Type                 Count
#>   <chr>                <int>
#> 1 Better.Understanding     4
#> 2 More.capacity            3
#> 3 More.Info                4
#> 4 Other                    1
#> 5 Reg.Reform               3
ggplot(DF_Summ, aes(Type, Count)) + geom_col()

Created on 2022-11-22 with reprex v2.0.2

I do not understand the goal of the "Engagement" plot. Is the Engagement column a simple yes/no if there are any values of 1 or 2 in Group.A:Group.O? Or is it a count of how many values of 1 or 2 occur in each row? If it is the first alternative, it seems every row would have Engagement = Yes. For the second alternative, it would be strange to group a plot by a continuous variable. Can you clear up my confusion?

Thank you very much @FJCC. I'll be able to learn, adapt and repurpose that code. Much appreciated. It's good to know there is no 'count by column' per se but you have to go down the pivot_longer route and then group by Type.

Regarding the 'Engagement' variable; yes, I was envisioning the creation of a simple dichotomy (yes/no or engaged/not_engaged) if a respondent answered 1 or 2 in Group.A:Group.O. Your observation that " it seems every row would have Engagement = Yes" is correct, however, this is just a sample dataset that I've created. The actual dataset reflects a very low level of engagement. The aim of incorporating 'Engagement' into the above geom_col is to highlight that respondents are wanting more information but not really engaging with the organisations that are best placed to provide the information.

I hope this explains things a little better.

To try to create the 'Engagement' variable I've played around with mutate_if, mutate_at and case_when, rather that within without any success. I'm getting stuck with applying conditional logic across multiple columns. Is this another situation for pivot_longer and then mutate_if?

Thank you, again, for taking the time to help me.

Calculating across columns is less convenient but it can be done with the rowwise() and c_across() functions from dplyr. Does this code do what you want?

library(tidyverse)

DF <- data.frame(ID = c(1L, 2L, 3L, 4L, 5L, 6L),
More.Info = c(1L, 1L, NA, 1L, NA, 1L),
Better.Understanding = c(NA, 1L, 1L, 1L, NA, 1L),
Reg.Reform = c(NA, 1L, 1L, NA, NA, 1L),
More.capacity = c(NA, 1L, NA, NA, 1L, 1L),
Other = c(1L, NA, NA, NA, NA, NA),
Group.A = c(1L, 3L, 2L, NA, 3L, 2L),
Group.B = c(2L, 1L, NA, 2L, 2L, 2L),
Group.C = c(1L, 1L, 3L, 1L, NA, NA),
Group.D = c(1L, 1L, 2L, 1L, 3L, 1L),
Group.E = c(2L, 3L, 3L, NA, NA, 3L),
Group.F = c(3L, 3L, 3L, NA, 2L, 1L),
Group.G = c(1L, 2L, 1L, 1L, 3L, 1L),
Group.H = c(3L, 3L, 1L, 1L, 2L, 2L),
Group.I = c(3L, 3L, NA, 1L, 1L, 3L),
Group.J = c(3L, 2L, 2L, 3L, 2L, NA),
Group.K = c(1L, 1L, 2L, 1L, 3L, 2L),
Group.L = c(1L, 1L, 2L, 3L, NA, 3L),
Group.M = c(1L, 3L, 3L, NA, 1L, 2L),
Group.N = c(3L, 1L, NA, 3L, 3L, 1L),
Group.O = c(3L, 3L, 2L, 2L, 2L, 3L)
)    

DF <- DF |> rowwise() |> 
  mutate(Engagement = any(c_across(Group.A:Group.O) <= 2))

DF_Long <- DF |> select(ID:Other, Engagement) |> 
  pivot_longer(More.Info:Other, names_to = "Type", values_to = "Value")
DF_Summ <- DF_Long |> group_by(Type, Engagement) |> 
  summarize(Count = sum(Value, na.rm = TRUE))
#> `summarise()` has grouped output by 'Type'. You can override using the
#> `.groups` argument.
DF_Summ
#> # A tibble: 5 × 3
#> # Groups:   Type [5]
#>   Type                 Engagement Count
#>   <chr>                <lgl>      <int>
#> 1 Better.Understanding TRUE           4
#> 2 More.capacity        TRUE           3
#> 3 More.Info            TRUE           4
#> 4 Other                TRUE           1
#> 5 Reg.Reform           TRUE           3

DF_Summ %>% 
  ggplot(aes(x = Type, y = Count, fill = Engagement)) +
  geom_col(alpha = 0.4, width = 0.9)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

Created on 2022-11-22 with reprex v2.0.2

1 Like

Thank you so much @FJCC - that worked perfectly! Exactly what I was after. I'm already having fun applying it to other variables :grinning:.

Thank you again!

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.