ggplot2 with percent as y-axis

I'm trying to get a side by side bar chart of gender with the percentage as the y-axis instead of the counts. I can do the counts but can't seem to get it manipulated to percent.

#library(tidyverse)  # may or may not need.
library(dplyr)
library(ggplot2)

# National data set ----
# 20 rows: a category label, five 0/1 indicators (two of them contain an NA),
# gender (one NA) and one continuous measure.
df_1 <- data.frame(
  categorical = c(
    "A", "B", "C", "A", "B", "A", "C", "C", "C", "A",
    "A", "C", "C", "C", "A", "C", "A", "B", "A", "C"
  ),
  indicator1 = c(1, 0, 1, NA, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1),
  indicator2 = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0),
  indicator3 = c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0),
  indicator4 = c(0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, NA, 1, 0, 0, 0, 0, 0),
  indicator5 = c(0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0),
  gender = c(
    "M", "M", NA, "F", "F", "F", "F", "F", "M", "U",
    "U", "F", "M", "M", "F", "F", "F", "U", "M", "F"
  ),
  continuous1 = c(
    2.3, 3.4, 6.6, 5.5, 6, 7, 11, 12.3, 13, 5,
    2.4, 3.6, 6.3, 5.2, 5, 6.6, 11.3, 12, 14, 5
  )
)

print(df_1)
summary(df_1)

# Overwrite every NA in the frame with 0. This also hits the character
# `gender` column, where the missing entry ends up as the string "0".
df_1 <- replace(df_1, is.na(df_1), 0)

# Flag these rows as the national audience (0).
df_1$audience <- 0
print(df_1)

# Client data set ----
# 10 rows with the same variables as df_1 (column order differs: continuous1
# comes before gender). indicator2 and indicator3 each contain one NA.
df_2 <- data.frame(
  categorical = c("A", "B", "C", "A", "B", "A", "C", "C", "C", "A"),
  indicator1 = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0),
  indicator2 = c(1, 1, 1, 1, NA, 1, 1, 1, 1, 1),
  indicator3 = c(0, NA, 1, 0, 1, 0, 0, 0, 0, 0),
  indicator4 = c(1, 1, 1, 1, 1, 0, 0, 0, 0, 1),
  indicator5 = c(1, 1, 1, 1, 1, 0, 1, 0, 1, 1),
  continuous1 = c(2.3, 3.4, 6.6, 5.5, 6, 7, 11, 12.3, 13, 5),
  gender = c("M", "F", "F", "F", "F", "M", "M", "M", "F", "F")
)

print(df_2)
summary(df_2)

# Overwrite every NA in the frame with 0.
df_2 <- replace(df_2, is.na(df_2), 0)

# Flag these rows as the client audience (1).
df_2$audience <- 1
print(df_2)

# Stack the national (audience 0) and client (audience 1) rows ----
df1_df2 <- rbind(df_1, df_2)
print(df1_df2)

# Dodged bar chart of gender, one bar colour per audience.
# This draws the bars, but the y-axis shows counts. The goal is percentages,
# with the dark bars summing to 100% and the orange bars summing to 100%.
a <- ggplot(df1_df2, aes(x = factor(gender), fill = factor(audience))) +
  geom_bar(stat = "count", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = c("#0B2632", "#FF5D00"))
print(a)

# Attempted percentage calculation -- it does not work, and it may not be the
# right approach anyway.
#
# Why it fails:
# - ddply() is a plyr function, but the library() calls in this script attach
#   only dplyr and ggplot2 (presumably plyr was attached in the session that
#   produced the error below -- confirm).
# - df1_df2 has no column named `count`, so inside transform() the name
#   `count` resolves to a function rather than to data. That is the "closure"
#   that sum() complains about in the error below.
# - Splitting by "gender" would make the shares add up to 100% within each
#   gender, not within each audience, which is what is wanted.
percent <- ddply(df1_df2, "gender", transform,
                 percent_gender = (count/sum(count))*100)
percent

# Error produced by the ddply() call:
#Error in sum(count) : invalid 'type' (closure) of argument

# Intended plot of the precomputed percentages; it is never reached because
# `percent` is not created.
# NOTE(review): position_dodge is passed without parentheses, i.e. the function
# itself rather than the result of calling it -- likely needs position_dodge()
# or "dodge"; confirm.
a <- ggplot(data = percent, aes(x = factor(gender), y=percent_gender, fill = factor(audience))) +
  scale_fill_manual(values=c("#0B2632","#FF5D00"))+
  geom_bar(stat="identity", position = position_dodge)
a
1 Like

Hi @shp5009,

Is this what you were hoping for the plot to look like:

# Share of each gender within each audience, drawn as dodged bars.
# Rows are counted per (audience, gender) and each count is divided by its
# audience's total, so every fill colour sums to 100% on its own.
df1_df2 %>%
  count(audience, gender) %>%
  group_by(audience) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = gender, y = pct, fill = factor(audience))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#0B2632", "#FF5D00"))

1 Like

Perfect! Thank you. I need to understand piping better. I can't seem to write it myself.

No problem. Piping with the magrittr pipe is great.

For basic use, the pipe takes the output of the expression on its left and passes it as the first argument to the function on its right. It works well with dplyr/tidyverse functions because almost all of them return a data frame as their output and accept a data frame as their first argument, which makes them highly pipeable.

In my above code example, group_by(), count(), mutate(), and ggplot() all accept data as the first argument, which means we can pipe into these functions without explicitly specifying the data (it automatically gets piped along).

Thanks for the explanation. That helps.

1 Like

Maybe, I spoke too soon. Your solution is what I want.
For some reason, I get the following error:
Error in count(., gender) : object 'gender' not found.

I reinstalled plyr which seems to be where count() is located, but that didn't work either.

Sorry, that is my fault. I should have provided a self-contained solution. Make sure to load the dplyr package: it provides the pipe, group_by(), count(), and mutate().

So, I still couldn't get it to work. I added the dplyr:: before count. I ended up getting it to run.
However, the y-axis isn't the same as yours for some reason. Both the black and orange add to 100% together instead of separately like yours does. I don't think mine is any different than yours except for the dplyr::

I'm stumped why mine is different. I need it like you did it.


#library(tidyverse)  # may or may not need.
library(dplyr)
library(plyr)
library(ggplot2)


# National data set ----
# 20 rows: a category label, five 0/1 indicators (two of them contain an NA),
# gender and one continuous measure. Note that the third gender value is the
# literal string "NA", not a missing value, so is.na() does not touch it.
df_1 <- data.frame(
  categorical = c(
    "A", "B", "C", "A", "B", "A", "C", "C", "C", "A",
    "A", "C", "C", "C", "A", "C", "A", "B", "A", "C"
  ),
  indicator1 = c(1, 0, 1, NA, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1),
  indicator2 = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0),
  indicator3 = c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0),
  indicator4 = c(0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, NA, 1, 0, 0, 0, 0, 0),
  indicator5 = c(0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0),
  gender = c(
    "M", "M", "NA", "F", "F", "F", "F", "F", "M", "U",
    "U", "F", "M", "M", "F", "F", "F", "U", "M", "F"
  ),
  continuous1 = c(
    2.3, 3.4, 6.6, 5.5, 6, 7, 11, 12.3, 13, 5,
    2.4, 3.6, 6.3, 5.2, 5, 6.6, 11.3, 12, 14, 5
  )
)

print(df_1)
summary(df_1)

# Overwrite every NA in the frame with 0.
df_1 <- replace(df_1, is.na(df_1), 0)

# Flag these rows as the national audience (0).
df_1$audience <- 0
print(df_1)

# Client data set ----
# 10 rows with the same variables as df_1 (column order differs: continuous1
# comes before gender). indicator2 and indicator3 each contain one NA.
df_2 <- data.frame(
  categorical = c("A", "B", "C", "A", "B", "A", "C", "C", "C", "A"),
  indicator1 = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0),
  indicator2 = c(1, 1, 1, 1, NA, 1, 1, 1, 1, 1),
  indicator3 = c(0, NA, 1, 0, 1, 0, 0, 0, 0, 0),
  indicator4 = c(1, 1, 1, 1, 1, 0, 0, 0, 0, 1),
  indicator5 = c(1, 1, 1, 1, 1, 0, 1, 0, 1, 1),
  continuous1 = c(2.3, 3.4, 6.6, 5.5, 6, 7, 11, 12.3, 13, 5),
  gender = c("M", "F", "F", "F", "F", "M", "M", "M", "F", "F")
)

print(df_2)
summary(df_2)

# Overwrite every NA in the frame with 0.
df_2 <- replace(df_2, is.na(df_2), 0)

# Flag these rows as the client audience (1).
df_2$audience <- 1
print(df_2)

# Stack the two data frames row-wise ----
# Both frames already carry the same nine column names, which is what lets
# rbind() append the client rows underneath the national rows.
df1_df2 <- rbind(df_1, df_2)
print(df1_df2)

### Adapted from the RStudio Community answer.
# Percentage of each gender within each audience, drawn as dodged bars.
#
# This script attaches plyr after dplyr, so the bare names mutate() and
# count() resolve to plyr's versions. plyr::mutate() ignores dplyr grouping,
# so `n / sum(n)` would be divided by the grand total and both colours would
# add up to 100% together instead of separately. Namespace-qualifying the
# dplyr verbs keeps the calculation per audience regardless of attach order.
df1_df2 %>%
  dplyr::group_by(audience) %>%
  dplyr::count(gender) %>%
  dplyr::mutate(pct = n / sum(n)) %>%
  ggplot(aes(gender, pct, fill = factor(audience))) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#0B2632", "#FF5D00")) +
  scale_y_continuous(labels = scales::percent_format())

image

Hi @shp5009,

Here's a tidyverse version of your code that does what you want:

library(tidyverse)  # use since it includes both dplyr and ggplot2

## Data for national data set. 
df_1 <- tibble(
  categorical = c("A","B","C","A","B","A","C","C","C","A","A","C","C","C","A","C","A","B","A","C"),
  indicator1   = c(1,0,1,NA,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1),
  indicator2   = c(1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0),
  indicator3   = c(1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,1,0),
  indicator4   = c(0,1,0,0,1,0,0,0,0,0,1,1,0,NA,1,0,0,0,0,0),
  indicator5   = c(0,0,1,1,1,0,1,0,1,0,1,1,1,1,1,0,0,0,0,0),
  gender       = c("M","M",NA,"F","F","F","F","F","M","U","U","F","M","M","F","F","F","U","M","F"),
  continuous1 = c(2.3,3.4,6.6,5.5,6,7,11,12.3,13,5,2.4,3.6,6.3,5.2,5,6.6,11.3,12,14,5))

# Replace missing values and tag the rows as audience 0.
# Numeric NAs become 0. The missing gender becomes the string "0", so the
# column stays character and the value still shows up as its own category.
# replace_na() requires the replacement to match the column's type, so a
# numeric 0 cannot be applied to every column at once.
df_1 <-
  df_1 %>%
  mutate(
    across(where(is.numeric), ~ replace_na(.x, 0)),
    gender = replace_na(gender, "0"),
    audience = 0
  )
head(df_1)
#> # A tibble: 6 x 9
#>   categorical indicator1 indicator2 indicator3 indicator4 indicator5 gender
#>   <chr>            <dbl>      <dbl>      <dbl>      <dbl>      <dbl> <chr> 
#> 1 A                    1          1          1          0          0 M     
#> 2 B                    0          1          0          1          0 M     
#> 3 C                    1          1          1          0          1 0     
#> 4 A                    0          0          0          0          1 F     
#> 5 B                    1          0          1          1          1 F     
#> 6 A                    0          0          0          0          0 F     
#> # … with 2 more variables: continuous1 <dbl>, audience <dbl>
## Data for client data set. 
df_2 <- tibble(
  categorical = c("A","B","C","A","B","A","C","C","C","A"),
  indicator1   = c(1,1,1,1,1,1,1,0,0,0),
  indicator2   = c(1,1,1,1,NA,1,1,1,1,1),
  indicator3   = c(0,NA,1,0,1,0,0,0,0,0),
  indicator4   = c(1,1,1,1,1,0,0,0,0,1),
  indicator5   = c(1,1,1,1,1,0,1,0,1,1),
  continuous1 = c(2.3,3.4,6.6,5.5,6,7,11,12.3,13,5),
  gender = c("M","F","F","F","F","M","M","M","F","F"))

# Set missing numerical data to 0 and tag the rows as audience 1.
# Only numeric columns are touched, so the numeric replacement is never
# applied to the character columns (categorical, gender), which have no NAs.
df_2 <-
  df_2 %>%
  mutate(
    across(where(is.numeric), ~ replace_na(.x, 0)),
    audience = 1
  )
head(df_2)
#> # A tibble: 6 x 9
#>   categorical indicator1 indicator2 indicator3 indicator4 indicator5
#>   <chr>            <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
#> 1 A                    1          1          0          1          1
#> 2 B                    1          1          0          1          1
#> 3 C                    1          1          1          1          1
#> 4 A                    1          1          0          1          1
#> 5 B                    1          0          1          1          1
#> 6 A                    1          1          0          0          0
#> # … with 3 more variables: continuous1 <dbl>, gender <chr>, audience <dbl>
# Combine the 2 data frames by stacking their rows.
# bind_rows() keeps every row and matches columns by name. union() is a set
# operation that drops duplicated rows, which would silently lower the counts
# if two respondents ever had identical values.
df1_df2 <- bind_rows(df_1, df_2)

# Plot ----
# Share of each gender within each audience, drawn as dodged bars. Counts are
# taken per (audience, gender) and divided by the per-audience total, so each
# fill colour sums to 100% on its own.
df1_df2 %>%
  count(audience, gender) %>%
  group_by(audience) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = gender, y = pct, fill = factor(audience))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#0B2632", "#FF5D00"))

Created on 2020-03-01 by the reprex package (v0.3.0)

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.