Tidyverse ggplot() help: How to segment each column in the geom_bar()

rstudio

#1

Hello,

I have a data set

Summary
structure(list(PROGRAM_LEVEL_DESCR = structure(c(1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L), .Label = c("Branch Refusal", 
"Club", "Corporate Refusal", "Credit Hold", "Customer Refusal", 
"Diamond", "Enrollment", "Failed 2X in Calendar Year", "Gold", 
"Institutional", "No Program", "Platinum", "RSVP", "Silver"), class = "factor"), 
    category = structure(c(1L, 2L, 3L, 5L, 7L, 1L, 2L, 3L, 4L, 
    5L, 7L, 8L, 10L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 
    1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L, 3L, 1L, 2L, 3L, 4L, 
    5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 
    4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 2L, 3L, 4L, 5L, 6L, 
    7L, 8L, 9L, 10L), .Label = c("1", "2-4", "5-7", "8-10", "11-15", 
    "16-20", "21-30", "31-40", "41-50", "> = 51"), class = "factor"), 
    count = c(2L, 1L, 1L, 1L, 1L, 3L, 5L, 3L, 1L, 4L, 1L, 1L, 
    1L, 12L, 14L, 6L, 1L, 2L, 4L, 5L, 5L, 2L, 1L, 14L, 22L, 7L, 
    8L, 5L, 3L, 1L, 1L, 4L, 1L, 16L, 18L, 12L, 9L, 11L, 3L, 6L, 
    3L, 1L, 3L, 10L, 21L, 9L, 3L, 5L, 2L, 2L, 2L, 1L, 779L, 918L, 
    294L, 140L, 104L, 60L, 56L, 21L, 14L, 45L, 12L, 17L, 8L, 
    3L, 3L, 4L, 4L, 2L, 2L, 3L, 1L, 20L, 55L, 26L, 16L, 27L, 
    11L, 20L, 3L, 4L, 9L), percent = c("33.3%", "16.7%", "16.7%", 
    "16.7%", "16.7%", "15.8%", "26.3%", "15.8%", "5.3%", "21.1%", 
    "5.3%", "5.3%", "5.3%", "34.3%", "40%", "17.1%", "2.9%", 
    "5.7%", "23.5%", "29.4%", "29.4%", "11.8%", "5.9%", "21.5%", 
    "33.8%", "10.8%", "12.3%", "7.7%", "4.6%", "1.5%", "1.5%", 
    "6.2%", "100%", "19.5%", "22%", "14.6%", "11%", "13.4%", 
    "3.7%", "7.3%", "3.7%", "1.2%", "3.7%", "18.2%", "38.2%", 
    "16.4%", "5.5%", "9.1%", "3.6%", "3.6%", "3.6%", "1.8%", 
    "32%", "37.8%", "12.1%", "5.8%", "4.3%", "2.5%", "2.3%", 
    "0.9%", "0.6%", "1.9%", "20.7%", "29.3%", "13.8%", "5.2%", 
    "5.2%", "6.9%", "6.9%", "3.4%", "3.4%", "5.2%", "100%", "10.5%", 
    "28.8%", "13.6%", "8.4%", "14.1%", "5.8%", "10.5%", "1.6%", 
    "2.1%", "4.7%")), class = c("grouped_df", "tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -83L), vars = "PROGRAM_LEVEL_DESCR", labels = structure(list(
    PROGRAM_LEVEL_DESCR = structure(c(1L, 2L, 4L, 6L, 7L, 8L, 
    9L, 10L, 11L, 12L, 13L, 14L), .Label = c("Branch Refusal", 
    "Club", "Corporate Refusal", "Credit Hold", "Customer Refusal", 
    "Diamond", "Enrollment", "Failed 2X in Calendar Year", "Gold", 
    "Institutional", "No Program", "Platinum", "RSVP", "Silver"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-12L), vars = "PROGRAM_LEVEL_DESCR", drop = TRUE), indices = list(
    0:4, 5:12, 13:17, 18:22, 23:31, 32L, 33:42, 43:51, 52:61, 
    62:71, 72L, 73:82), drop = TRUE, group_sizes = c(5L, 8L, 
5L, 5L, 9L, 1L, 10L, 9L, 10L, 10L, 1L, 10L), biggest_group_size = 10L)

I have a sample plot which I do not know how to fix/plot the way I intended to.

My code:

Summary
df2a1 <- df2a %>% group_by(PROGRAM_LEVEL_DESCR,category) %>% summarise(count=n()) %>% 
  mutate(percent= paste0(round(count/sum(count)*100,1),'%')) 

plot_df2a1 = ggplot(df2a1) + geom_bar(aes(x=category,y=count, fill = PROGRAM_LEVEL_DESCR),stat='identity') + 
  labs(y='Number of Distinct Customers',x=' # of PL Orders in the PL Cart')+ 
  geom_text(aes(x=category,y=count,label=percent),vjust=-0.5) 

My goal:

  • First, I still want my x-axis to be category and my y-axis to be number of customers.
  • Second, I want to see the segmentation within each column for each category marked on the x-axis, if my description makes sense. For instance, for the bar at x = 1, I know that 32% of customers buy only 1 item, but I want to know which groups make up that whole 32% column, based on the column in my data set called "Customer Program Description".
    For example, if I am making sense,

    for category "1", there are 10 different customer segments within this category, so I want to implement these segments in this column.

I hope my post is detailed enough.

Please keep me posted with the feedback.
Thanks, team!


#2

I'm not sure if I understand what you're trying to do, so please let me know if I'm on the right track. Do you want the percentage of PROGRAM_LEVEL_DESCR within each category? If so, change the grouping order:

df2a1 <- df2a %>% 
  group_by(category, PROGRAM_LEVEL_DESCR) %>% 
  summarise(count=n()) %>% 
  mutate(percent= paste0(round(count/sum(count)*100,1),'%'))

This can also be coded as follows, where I've used the tally function and also avoided rounding and pasting (we can do that inside ggplot without need to alter the percent values produced by mutate):

# Count rows per (category, PROGRAM_LEVEL_DESCR) pair, then compute each
# program level's share within its category.
# tally() names its output column `n`; rename it to `count` so the plot code
# below (which maps y = count) finds a real column instead of the function
# dplyr::count.
# After tally() the data is still grouped by `category`, so sum(count) is the
# per-category total and `percent` stays numeric (a proportion in [0, 1]).
df2a1 <- df2a %>% 
  group_by(category, PROGRAM_LEVEL_DESCR) %>% 
  tally() %>% 
  rename(count = n) %>% 
  mutate(percent = count / sum(count))

Then the plot would be as follows, where I've (1) moved the x and y aesthetics to the main ggplot call, since they apply to both geoms, (2) added position=position_stack(vjust=0.5) to geom_text, which will get the text labels vertically centered within each bar, and (3) used the sprintf function to format the percent labels.

ggplot(df2a1, aes(x=category, y=count)) + 
  geom_bar(aes(fill = PROGRAM_LEVEL_DESCR),stat='identity') + 
  geom_text(aes(label=sprintf("%1.1f%%", percent*100)), position=position_stack(vjust=0.5), size=3, colour="white") +
  labs(y='Number of Distinct Customers', x=' # of PL Orders in the PL Cart')

#3

Joels:

 ggplot(df2a1, aes(x=category, y=count)) + 
+   geom_bar(aes(fill = PROGRAM_LEVEL_DESCR),stat='identity') + 
+   geom_text(aes(label=sprintf("%1.1f%%", percent*100)), position=position_stack(vjust=0.5), size=3, colour="white") +
+   labs(y='Number of Distinct Customers', x=' # of PL Orders in the PL Cart')
Error in percent * 100 : non-numeric argument to binary operator

Please help!


#4

If you're using your original df2a1, then percent is a vector of character strings. To use the plot code I provided, you need to leave percent as a numeric variable (as in the second version of the code where I create df2a1) .


#5
> df2a1 <- df2a %>% 
+   group_by(category, PROGRAM_LEVEL_DESCR) %>% 
+   tally %>% 
+   mutate(percent=n/sum(n))
> View(df2a1)
> ggplot(df2a1, aes(x=category, y=count)) + 
+   geom_bar(aes(fill = PROGRAM_LEVEL_DESCR),stat='identity') + 
+   geom_text(aes(label=sprintf("%1.1f%%", percent*100)), position=position_stack(vjust=0.5), size=3, colour="white") +
+   labs(y='Number of Distinct Customers', x=' # of PL Orders in the PL Cart')
Don't know how to automatically pick scale for object of type function. Defaulting to continuous.
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,  : 
  arguments imply differing number of rows: 113, 0


#6

@joels
Joel:
Do you mind if I tag you in this post? If not, I won't tag you. Let me know!

You have been helpful.
Thank you.


#7

@joels
Joel:
I got the plot, but my thought on the 0.0% error is as follows:


Is there a way for me to show the percentage of each PROGRAM_LEVEL_DESCR within the "0%" category?
Then we can fix the 0.0% error.
Am I clear enough?
For example, for the "0%" category, there are 14 different levels. I want to see the percentage for each of them in the "0%" category. So for the first one would be 133/total *100.

Thanks!


#8

It will be much easier to troubleshoot this if you can put it in a reproducible example, which you can do with reprex. That way your input, output, and charts will all be self-contained so anyone can just copy and paste to replicate exactly what you're doing.

You can install reprex, as shown below.

# Install the reprex package from CRAN (install.packages is the base R installer).
install.packages("reprex")
What to do if you run into clipboard problems

If you run into problems with access to your clipboard, you can specify an outfile for the reprex, and then copy and paste the contents into the forum.

reprex::reprex(input = "fruits_stringdist.R", outfile = "fruits_stringdist.md")

For pointers specific to the community site, as well as several helpful resources for learning more about reprex, check out the reprex FAQ, linked to below.


#10

If I understand you correctly, I think you can do this with a group_by(category) and mutate(prct_within_cat = count / sum(count))
And then in your plot geom_text, refer to prct_within_cat instead of percent*100


#11

To Clarify my post:


Recall that each customer has a unique Rewards/Advantage level.
So for the first category: "0%", there are 14,998 customers who do not buy PL, and they occupy about 32.5% out of the customers pool.

Next, I want to dig deeply into this number. If you want to do a marketing campaign and you want to target the RIGHT audiences, you need to see the segmentation within each column. So long story short, I want to see in this 32.5%, how many of them are "Silver" customers, how many of them are "Non-Program" customers.

Why? Because I need to go out and talk to them. "Hey, why don't you buy PL? you are diamond or silver customers. You have stayed with us for a long time. Did PL products disappoint you? If so, tell me more about it..."

To go back to your question, it can be answered similarly to the info above. For the column "1 PL order", I want to see the segments in it.
"Hey, why don't you buy more than 1 PL? You are diamond or silver customers. You have stayed with us for a long time. Did PL products disappoint you? If so, tell me more about it... Is the price too expensive? Would you be interested in buying more than one in the future...."

Am I making sense?


#12
> df2a2 <- df2a %>% group_by(category) %>% summarise(count=n()) %>% 
+   mutate(percent = round(count/sum(count)*100))

> plot_df2a2 = ggplot(df2a2) + geom_bar(aes(x=category,y=count, fill = PROGRAM_LEVEL_DESCR),stat='identity') + 
+   labs(y='Number of Distinct Customers',x=' # of PL Orders in the PL Cart')+ 
+   geom_text(aes(x=category,y=count,label=percent),vjust=-0.5)
> plot_df2a2

'Error: Aesthetics must be either length 1 or the same as the data (9): x, y, fill

#13

The point of a reprex is that it's self-contained so that rather than describing what's in the data, others can see a sample of the data and help you modify your code, etc.

Jenny describes it really well in this video (starts ~10:40).


#14

Reprex output:

library(tidyverse)

df3 = data %>% filter(QtySold > 0L, Sales > 0L) %>%
  group_by(CUSTOMER_NUMBER,PRODUCT_SUB_LINE_DESCR,PROGRAM_LEVEL_DESCR ) %>% 
  summarise(Quants = sum(QtySold )) %>%
  ungroup() %>%
  spread(PRODUCT_SUB_LINE_DESCR,Quants,fill=0) %>%
  mutate(Total_Orders = `PRIVATE LABEL` + SUNDRY + Handpieces,
         PL_Order_Percentage= round((`PRIVATE LABEL` / Total_Orders) * 100),
         category = cut(PL_Order_Percentage,breaks = c(0,1,11,21,31,41,51,61,71,Inf), 
                        labels = c('0%','1%-10%','11%-20%',
                                   '21%-30%','31%-40%','41%-50%',
                                   '51%-60%','61%-70%','>= 71%'),include.lowest = T,right = F)
  ) 
#> Error in UseMethod("filter_"): no applicable method for 'filter_' applied to an object of class "function"

df3a <- df3 %>% 
  group_by(category, PROGRAM_LEVEL_DESCR) %>% 
  summarise(count=n(),
            percent = count/sum(count))
#> Error in eval(lhs, parent, parent): object 'df3' not found

df3a %>% 
  ggplot(aes(x=category, y=count)) + 
  geom_bar(aes(fill = PROGRAM_LEVEL_DESCR),stat='identity') +
  labs(y='Number of Distinct Customers', x=' # of PL Orders in the PL Cart') +
  geom_text(aes(label=sprintf("%1.1f%%", percent)), 
            position=position_stack(vjust=0.5), size=3, colour="white")
#> Error in eval(lhs, parent, parent): object 'df3a' not found

Created on 2018-06-29 by the reprex package (v0.2.0).


#15

Thanks for giving reprex() a try. You’re almost there.

This error:
#> Error in UseMethod("filter_"): no applicable method for 'filter_' applied to an object of class "function"

is appearing (and causing the cascade of other errors) because you haven’t included code that creates your data in the chunk of code that you applied reprex() to, so R thinks data refers to a built-in R function by that name. reprex() runs in its own separate R session, so your code really does need to be self-contained — it can’t access objects in the session and environment that you called reprex() from.

The quickest path to a functional self-contained example here is to choose one of the methods in this thread to use to include code that creates a sample data object named data in your reprex()ed chunk.


#16

@jcblum
Reprex trial #2

Body is limited to 32000 characters; you entered 72041.

I am super confused. Everyone posts comments, guidelines, etc. but it does not mean the readers understand completely.
This is just my opinion!


#17

If your dataset is that big, the next steps are:

  1. Consider whether your question can be answered with a subset or random sample of your data (the answer is almost always yes...). If so, provide this instead of your whole data set.

  2. If your question really relies on having the whole dataset, you’ll need to post that part of the code separately, as a GitHub gist or similar (see: “How to upload or share data files here”).

I know that figuring out how to pose your questions this way has a learning curve and that can be frustrating when you just want to get to the solving-your-problem part. But once you’ve wrapped your head around it, there are major benefits. You get a clearer picture of your problem by structuring your question in a self-contained, minimally complex way. You spend less time going back and forth with your helpers trying to explain what you mean. More people want to help with your questions because they’re easier and more fun to dig into.

And like with everything else, it’s totally ok to be confused and make mistakes as you go along! No judgement from me as long as you’re making the effort.


#19

@jcblum

library(tidyverse) 
library(scales) 
#> 
#> Attaching package: 'scales'
#> The following object is masked from 'package:purrr':
#> 
#>     discard
#> The following object is masked from 'package:readr':
#> 
#>     col_factor
library(cowplot) 
#> 
#> Attaching package: 'cowplot'
#> The following object is masked from 'package:ggplot2':
#> 
#>     ggsave
library(dplyr)
library(ggplot2)
library(reprex)
library("datapasta")
df3a = structure(list(category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
                                             1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                             2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
                                             4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 
                                             5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 
                                             6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
                                             7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 
                                             9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("0%", 
                                                                                                     "1%-10%", "11%-20%", "21%-30%", "31%-40%", "41%-50%", "51%-60%", 
                                                                                                     "61%-70%", ">= 71%"), class = "factor"), PROGRAM_LEVEL_DESCR = structure(c(1L, 
                                                                                                                                                                                2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 
                                                                                                                                                                                2L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 1L, 2L, 3L, 4L, 6L, 
                                                                                                                                                                                7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 6L, 7L, 
                                                                                                                                                                                9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
                                                                                                                                                                                9L, 10L, 11L, 12L, 13L, 14L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 
                                                                                                                                                                                11L, 12L, 14L, 1L, 2L, 3L, 4L, 6L, 7L, 9L, 10L, 11L, 12L, 13L, 
                                                                                                                                                                                14L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 
                                                                                                                                                                                1L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L), .Label = c("Branch Refusal", 
                                                                                                                                                                                                                                                     "Club", "Corporate Refusal", "Credit Hold", "Customer Refusal", 
                                                                                                                                                                                                                                                     "Diamond", "Enrollment", "Failed 2X in Calendar Year", "Gold", 
                                                                                                                                                                                                                                                     "Institutional", "No Program", "Platinum", "RSVP", "Silver"), class = "factor"), 
                      count = c(133L, 172L, 5L, 215L, 1L, 104L, 389L, 13L, 843L, 
                                193L, 10743L, 482L, 10L, 1695L, 3L, 383L, 59L, 471L, 98L, 
                                2L, 1675L, 87L, 1284L, 1719L, 1351L, 6L, 290L, 3L, 39L, 262L, 
                                85L, 3L, 1123L, 76L, 1255L, 1003L, 1L, 1000L, 3L, 208L, 5L, 
                                31L, 189L, 69L, 731L, 79L, 979L, 670L, 1L, 732L, 1L, 156L, 
                                8L, 33L, 1L, 127L, 70L, 1L, 547L, 55L, 967L, 480L, 1L, 568L, 
                                150L, 5L, 31L, 85L, 65L, 2L, 416L, 38L, 907L, 319L, 531L, 
                                1L, 102L, 14L, 18L, 63L, 35L, 307L, 25L, 533L, 236L, 2L, 
                                317L, 3L, 90L, 18L, 22L, 1L, 33L, 38L, 1L, 254L, 25L, 640L, 
                                180L, 275L, 8L, 179L, 48L, 76L, 100L, 150L, 5L, 503L, 95L, 
                                4032L, 339L, 2L, 812L)), class = c("grouped_df", "tbl_df", 
                                                                   "tbl", "data.frame"), row.names = c(NA, -113L), vars = "category", drop = TRUE)
df3a %>% 
  ggplot(aes(x=category, y=count)) + 
  geom_bar(aes(fill = PROGRAM_LEVEL_DESCR),stat='identity') +
  labs(y='Number of Distinct Customers', x=' # of PL Orders in the PL Cart') +
  geom_text(aes(label=sprintf("%1.1f%%", percent)), 
            position=position_stack(vjust=0.5), size=3, colour="white")
#> Error in as.double(function (x) : cannot coerce type 'closure' to vector of type 'double'

Created on 2018-06-29 by the reprex package (v0.2.0).


#20

@mara
To clarify your comment, my post above was meant to describe my goal, i.e., what I wanted to do with the data. I was by no means trying to describe what the data is.


#21

Right, my point was really that it is much easier to help you (or anyone) when we have a self-contained, reproducible example. Screenshots of what's in the data are not nearly so helpful as what you put:


since, with the latter, anyone can literally copy and paste and run your code to see if their suggested changes make the difference you've described or help you achieve your aim.


#22

Mara:
Any input how to solve this problem?

Thanks!