Bar plot with two data frames

Hello!

I want to do a bar plot with mean and standard deviation, where the bars come from two different data frames. For example, in the plot there would be Q1 and Q2 from both data frames. And I have no idea how to do this. Can anyone help?

The kind of plot ↓

# Horizontal bar chart of `mean` for Q1 and Q2 with mean +/- sd error bars.
ggplot(
  subset(data, question %in% c("Q1", "Q2")),
  aes(x = question, y = mean)
) +
  # geom_col() draws bars whose height is the y value itself
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(width = 0.5) +
  # `width` is a fixed setting, not a data mapping, so it goes outside aes().
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.5) +
  coord_flip()

The data frames are ↓

# First data frame (looks like dput() output): one row per question with the
# columns question, n, mean and sd. n is 204 in every row; the Q10 row has NA
# for both mean and sd.
structure(list(question = c("Q1", "Q10", "Q11", "Q12", "Q2", 
"Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Q9"), n = c(204L, 204L, 
204L, 204L, 204L, 204L, 204L, 204L, 204L, 204L, 204L, 204L), 
    mean = c(5.22549019607843, NA, 4.95098039215686, 4.39705882352941, 
    5.47058823529412, 5.51470588235294, 4.50490196078431, 4.92647058823529, 
    4.40686274509804, 5.56862745098039, 5.56372549019608, 5.23529411764706
    ), sd = c(1.1524816893289, NA, 1.31214449357814, 1.5422430010719, 
    1.12039650223724, 1.15104553532809, 1.37714471881058, 1.34621721218454, 
    1.30030385262334, 0.871099231072865, 0.830963499839951, 1.36945187401243
    )), row.names = c(NA, 12L), class = c("tbl_df", "tbl", "data.frame"
))

# Second data frame (looks like dput() output): same columns and question
# order as the first one. n is 13 in every row and no values are missing.
structure(list(question = c("Q1", "Q10", "Q11", "Q12", "Q2", 
"Q3", "Q4", "Q5", "Q6", "Q7", "Q8", "Q9"), n = c(13L, 13L, 13L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L), mean = c(5.38461538461539, 
4.38461538461539, 4.69230769230769, 4.30769230769231, 5.15384615384615, 
5.38461538461539, 4.76923076923077, 5.30769230769231, 4.53846153846154, 
5.61538461538461, 5.69230769230769, 4.92307692307692), sd = c(1.26085034391223, 
1.44559454541846, 1.03155347127648, 1.60128153805087, 0.898717034272917, 
1.12089707663561, 1.01273936708367, 0.85485041426511, 0.967417922046845, 
1.26085034391223, 0.85485041426511, 1.84668795692624)), row.names = c(NA, 
12L), class = c("tbl_df", "tbl", "data.frame"))

Something like this?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(ggplot2)

# First data frame: one row per question with its n, mean and sd.
# The Q10 row has NA for both mean and sd.
data_1 <- structure(
  list(
    question = c(
      "Q1", "Q10", "Q11", "Q12",
      "Q2", "Q3", "Q4", "Q5",
      "Q6", "Q7", "Q8", "Q9"
    ),
    # n is the same for all 12 questions.
    n = rep(204L, 12L),
    mean = c(
      5.22549019607843, NA, 4.95098039215686,
      4.39705882352941, 5.47058823529412, 5.51470588235294,
      4.50490196078431, 4.92647058823529, 4.40686274509804,
      5.56862745098039, 5.56372549019608, 5.23529411764706
    ),
    sd = c(
      1.1524816893289, NA, 1.31214449357814,
      1.5422430010719, 1.12039650223724, 1.15104553532809,
      1.37714471881058, 1.34621721218454, 1.30030385262334,
      0.871099231072865, 0.830963499839951, 1.36945187401243
    )
  ),
  row.names = c(NA, 12L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Second data frame: same columns and question order as data_1,
# with n = 13 in every row and no missing values.
data_2 <- structure(
  list(
    question = c(
      "Q1", "Q10", "Q11", "Q12",
      "Q2", "Q3", "Q4", "Q5",
      "Q6", "Q7", "Q8", "Q9"
    ),
    # n is the same for all 12 questions.
    n = rep(13L, 12L),
    mean = c(
      5.38461538461539, 4.38461538461539, 4.69230769230769,
      4.30769230769231, 5.15384615384615, 5.38461538461539,
      4.76923076923077, 5.30769230769231, 4.53846153846154,
      5.61538461538461, 5.69230769230769, 4.92307692307692
    ),
    sd = c(
      1.26085034391223, 1.44559454541846, 1.03155347127648,
      1.60128153805087, 0.898717034272917, 1.12089707663561,
      1.01273936708367, 0.85485041426511, 0.967417922046845,
      1.26085034391223, 0.85485041426511, 1.84668795692624
    )
  ),
  row.names = c(NA, 12L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Stack both data frames; `.id = "id"` adds a column that is "1" for rows from
# data_1 and "2" for rows from data_2 (the order they are passed in).
bind_rows(data_1, data_2, .id = "id") %>%
  filter(question %in% c("Q1", "Q2")) %>%
  ggplot(aes(x = question, y = mean, fill = id)) +
  # One bar per data frame, side by side within each question.
  geom_col(position = position_dodge(width = 0.5), width = 0.5) +
  # The error bars must use the same dodge as the bars (and be grouped by id),
  # otherwise they are drawn at the question's centre instead of on their bar.
  # `width` is a fixed setting, so it goes outside aes(). The default black
  # colour keeps the part of the error bar that overlaps the bar visible.
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd, group = id),
    position = position_dodge(width = 0.5),
    width = 0.2
  ) +
  coord_flip()

1 Like

Thanks for introducing me to bind_rows; this is almost perfect. Now I only need to find out how to tell which bar comes from which data frame. There is one flaw, though: the error bars should be centred on the bars.

If you look at the plot, there's a legend based on the column "id". This column is created by bind_rows while concatenating the two datasets. It numbers the data frames in the same order in which they are passed to the function call.

Unfortunately, I can't help you with this. I just used your snippet. I don't use ggplot2 much myself. Sorry, hopefully someone else will help you out.