Creating grouped bar chart with multiple numeric columns

So I'm really new to R and am trying to create some figures in RStudio, but I am having a hard time getting my plots to work. I've included my code and the output below. Basically, I want to create a grouped bar chart where the information is grouped by treatment group, and each of the "_avg" columns is one variable that I want plotted on the y-axis. I'm not sure how to set the y and fill to reflect this. Any input is helpful.

Code:

# RSTUDIO PACKAGE INSTALLATION ################################################

library(janitor)
library(tidyverse)
library(ggplot2)
library(readr)
library(dplyr)

# IMPORT THE DATA #############################################################
# Read the spreadsheet export into a data frame. The path is relative, so the
# file is looked up in the current working directory.
organic_acids<-read.csv(file='OCT19_Queue_organic acid summary8.csv')


# Print the raw table and a per-column summary to inspect what was read.
organic_acids
summary(organic_acids)

# FORMATTING SPREADSHEET COLUMN NAMES AND TYPES ###############################

# Each gsub() call below removes or rewrites a fragment of every column name.
# NOTE(review): gsub() treats its pattern as a regular expression, so every "."
# in these patterns matches ANY character, not only a literal dot. Pass
# fixed = TRUE (or escape the dots) if only literal dots are intended.
colnames(organic_acids)<-gsub("X..","",colnames(organic_acids))

colnames(organic_acids)<-gsub("..db.","",colnames(organic_acids))

# Turns e.g. "<any char>Acid" into "_Acid".
colnames(organic_acids)<-gsub(".Acid","_Acid",colnames(organic_acids))

# Presumably strips a byte-order-mark artifact from the first column name --
# TODO confirm against the raw header.
colnames(organic_acids)<-gsub("ï..","",colnames(organic_acids))

# rename() takes new_name = old_name pairs.
organic_acids<-organic_acids %>% 
  rename("experiment_id"='Experiment.ID',
           'sample_name'='OA.sample.name',
         'treatment_group'='Sample_overview',
         'wet_biomass_extracted_g'='Wet.Biomass.extracted',
         'mc_content'='Moisture.content',
         'dry_biomass_g'='Dry.Biomass.extracted')

# Run readr::parse_number() on every column whose name contains any of the
# listed fragments, so those columns end up numeric.
organic_acids<-organic_acids %>% 
  mutate(across(contains(c("mc_content","MC_avg","_Acid","A_avg","total_avg","A_stdev","total_stdev")),parse_number))

organic_acids
                                     
# summary(organic_acids)
# view(organic_acids)
# sapply(organic_acids,class)

# CREATING DATA PLOTS #########################################################

# Keep only the treatment label and the ten "_avg" columns to be plotted.
plotting_variables<-organic_acids %>% 
  select(treatment_group,SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
         OA_total_avg)

# Drop every row that has an NA in any of the selected columns.
plotting_variables_remove_na<-na.omit(plotting_variables)

plotting_variables_remove_na

# group_by() only attaches grouping metadata: the result is still a whole
# table (treatment_group plus the ten value columns), not a single vector.
numeric_variables<-plotting_variables_remove_na %>% 
  group_by(SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
           OA_total_avg)

numeric_variables

# NOTE(review): `numeric_variables` is a separate data frame, not a column of
# plotting_variables_remove_na, so it cannot serve as the y or fill aesthetic;
# this call stops with "Aesthetics must be either length 1 or the same as the
# data (14)". Also, geom_bar() counts rows by default; geom_col() is the
# variant that takes a y value.
ggplot(plotting_variables_remove_na,aes(fill=numeric_variables,
                                        y=numeric_variables,x=treatment_group))+
  geom_bar()

Output:

> plotting_variables<-organic_acids %>% 
+   select(treatment_group,SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
+          OA_total_avg)
> plotting_variables_remove_na<-na.omit(plotting_variables)
> plotting_variables_remove_na
           treatment_group SA_avg LA_avg FA_avg AA_avg PA_avg IBA_avg BA_avg IVA_avg VA_avg OA_total_avg
1          Initial biomass   0.23   0.00   0.00   0.02   0.05    0.00   0.00    0.00   0.00         0.30
2  Initial biomass with CA   0.22   0.00   0.00   0.01   0.04    0.00   0.00    0.00   0.00         0.27
3                  Aer_4hr   2.34   0.02   0.03   0.05   0.70    0.00   0.00    0.01   0.00         3.15
6                  Aer_8hr   2.85   0.04   0.05   0.10   0.95    0.00   0.01    0.02   0.00         4.00
9                 Aer_24hr   1.58   0.04   0.02   0.08   0.63    0.00   0.02    0.00   0.00         2.37
12               Aer_28day   0.68   0.15   0.34   5.02   3.27    0.25   4.27    0.97   1.32        16.27
15                 Ana_4hr   0.71   0.00   0.01   0.02   0.23    0.00   0.00    0.00   0.00         0.98
18                 Ana_8hr   2.12   0.02   0.06   0.04   0.70    0.00   0.01    0.00   0.00         2.96
21                Ana_24hr   2.10   0.07   0.06   0.04   0.71    0.00   0.03    0.00   0.00         2.99
24               Ana_28day   1.01   0.30   0.27   3.06   2.78    0.04   2.50    0.42   0.00        10.39
26                  CA_4hr   2.05   0.14   0.05   0.01   0.68    0.00   0.00    0.00   0.00         2.93
29                  CA_8hr   1.92   0.31   0.06   0.01   0.78    0.01   0.00    0.00   0.00         3.09
32                 CA_24hr   1.79   0.28   0.04   0.02   0.62    0.00   0.00    0.00   0.00         2.76
35                CA_28day  29.68   1.30   0.43   2.49   2.09    0.00   0.03    2.48   0.00        38.50
> numeric_variables<-plotting_variables_remove_na %>% 
+   group_by(SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
+            OA_total_avg)
> numeric_variables
# A tibble: 14 x 11
# Groups:   SA_avg, LA_avg, FA_avg, AA_avg, PA_avg, IBA_avg, BA_avg, IVA_avg, VA_avg, OA_total_avg [14]
   treatment_group         SA_avg LA_avg FA_avg AA_avg PA_avg IBA_avg BA_avg IVA_avg VA_avg OA_total_avg
   <chr>                    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl>        <dbl>
 1 Initial biomass           0.23   0      0      0.02   0.05    0      0       0      0            0.3 
 2 Initial biomass with CA   0.22   0      0      0.01   0.04    0      0       0      0            0.27
 3 Aer_4hr                   2.34   0.02   0.03   0.05   0.7     0      0       0.01   0            3.15
 4 Aer_8hr                   2.85   0.04   0.05   0.1    0.95    0      0.01    0.02   0            4   
 5 Aer_24hr                  1.58   0.04   0.02   0.08   0.63    0      0.02    0      0            2.37
 6 Aer_28day                 0.68   0.15   0.34   5.02   3.27    0.25   4.27    0.97   1.32        16.3 
 7 Ana_4hr                   0.71   0      0.01   0.02   0.23    0      0       0      0            0.98
 8 Ana_8hr                   2.12   0.02   0.06   0.04   0.7     0      0.01    0      0            2.96
 9 Ana_24hr                  2.1    0.07   0.06   0.04   0.71    0      0.03    0      0            2.99
10 Ana_28day                 1.01   0.3    0.27   3.06   2.78    0.04   2.5     0.42   0           10.4 
11 CA_4hr                    2.05   0.14   0.05   0.01   0.68    0      0       0      0            2.93
12 CA_8hr                    1.92   0.31   0.06   0.01   0.78    0.01   0       0      0            3.09
13 CA_24hr                   1.79   0.28   0.04   0.02   0.62    0      0       0      0            2.76
14 CA_28day                 29.7    1.3    0.43   2.49   2.09    0      0.03    2.48   0           38.5 
> ggplot(plotting_variables_remove_na,aes(fill=numeric_variables,
+                                         y=numeric_variables,x=treatment_group))+
+   geom_bar()
Don't know how to automatically pick scale for object of type grouped_df/tbl_df/tbl/data.frame. Defaulting to continuous.
Don't know how to automatically pick scale for object of type grouped_df/tbl_df/tbl/data.frame. Defaulting to continuous.
Error in `check_aesthetics()`:
! Aesthetics must be either length 1 or the same as the data (14): y and fill
Run `rlang::last_error()` to see where the error occurred.

I've also tried typing in the different columns as the y= and fill= as below:

code:

# NOTE(review): c() concatenates the nine listed columns end to end, giving one
# vector of 9 x 14 = 126 values, while x (treatment_group) has only 14. That
# length mismatch is what check_aesthetics() reports.
# FA_avg is not in these lists although it is selected earlier -- possibly
# unintentional; confirm.
ggplot(plotting_variables_remove_na,aes(fill=c(SA_avg,LA_avg,AA_avg,PA_avg,
                                               IBA_avg,BA_avg,IVA_avg,VA_avg,
                                               OA_total_avg),
                                        y=c(SA_avg,LA_avg,AA_avg,PA_avg,IBA_avg,
                                            BA_avg,IVA_avg,VA_avg,
                                            OA_total_avg),x=treatment_group))+
  geom_bar()

output:

Error in `check_aesthetics()`:
! Aesthetics must be either length 1 or the same as the data (14): y and fill
Run `rlang::last_error()` to see where the error occurred.
1 Like

Just at a glance it looks like aes() is not written in a way it understands.

We don't have access to your .csv to verify however.

image

So that's part of why I put the output too. The dataframe is in the output after the "plotting_variables_remove_na" above. I'm not sure how to format the y= and x= to make the different _avg columns (as shown, like SA_avg, LA_avg, etc) as a column in each treatment group.

This is what I mean. Let me know if this works.

# x must name a column of plotting_variables_remove_na; that column is called
# `treatment_group` (there is no `treatment_groups` column in the table).
ggplot(plotting_variables_remove_na, aes(x = treatment_group, y = numeric_variables, fill = numeric_variables)) +
  geom_bar()

Oh okay. I did try that and got the following error

> ggplot(plotting_variables_remove_na, aes(x = treatment_group, y = numeric_variables, fill = numeric_variables)) +
+   geom_bar()
Don't know how to automatically pick scale for object of type grouped_df/tbl_df/tbl/data.frame. Defaulting to continuous.
Don't know how to automatically pick scale for object of type grouped_df/tbl_df/tbl/data.frame. Defaulting to continuous.
Error in `check_aesthetics()`:
! Aesthetics must be either length 1 or the same as the data (14): y and fill
Run `rlang::last_error()` to see where the error occurred.

It looks like you're assigning whole tables within your aesthetics, rather than variables from within the plotting_variables_remove_na table.

I've looked at this page and I'm not sure how what I typed creates a vector, as I am using the assignment of y to be the numeric_variables table, which says the length is 14 , as shown in the error below. But according to my environment the number of observations is 14. The website you shared shows that it saved a series of numbers as "y" to use, but isn't that the same thing I did with saving the columns of interest as numeric_variables? I'm really new to this so I'm trying to figure it out, but I'm not sure how what I'm doing is different.

Error in `check_aesthetics()`:
! Aesthetics must be either length 1 or the same as the data (14): y

image

You can think of it like a matrix: based on your tables, aes() is asking that you provide a 14 by 1 matrix, whereas you're providing a 14 by 11 matrix.

You're providing 11 variables where it only asks for 1.

The aes() is also referencing the table you've placed in ggplot() so that it knows which variables to use.

You may also want to consider using geom_col() rather than geom_bar() in this instance.

# Both x and y name columns that exist in plotting_variables_remove_na,
# so ggplot can look them up in the data passed as the first argument.
ggplot(
  data = plotting_variables_remove_na,
  mapping = aes(x = treatment_group, y = SA_avg)
) +
  geom_col()

Okay, I kinda get that. But then how do I get a grouped bar chart where each of those variables is graphed, grouped by the treatment group? So for example, for the following table I want the initial biomass to have a bar for all the variables. And then the Initial biomass with CA to have a bar for all of the variables, and so on for each treatment group. Does this make sense?

> numeric_variables
# A tibble: 14 x 11
# Groups:   SA_avg, LA_avg, FA_avg, AA_avg, PA_avg, IBA_avg, BA_avg, IVA_avg, VA_avg, OA_total_avg [14]
   treatment_group         SA_avg LA_avg FA_avg AA_avg PA_avg IBA_avg BA_avg IVA_avg VA_avg OA_total_avg
   <chr>                    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>  <dbl>   <dbl>  <dbl>        <dbl>
 1 Initial biomass           0.23   0      0      0.02   0.05    0      0       0      0            0.3 
 2 Initial biomass with CA   0.22   0      0      0.01   0.04    0      0       0      0            0.27
 3 Aer_4hr                   2.34   0.02   0.03   0.05   0.7     0      0       0.01   0            3.15
 4 Aer_8hr                   2.85   0.04   0.05   0.1    0.95    0      0.01    0.02   0            4   
 5 Aer_24hr                  1.58   0.04   0.02   0.08   0.63    0      0.02    0      0            2.37
 6 Aer_28day                 0.68   0.15   0.34   5.02   3.27    0.25   4.27    0.97   1.32        16.3 
 7 Ana_4hr                   0.71   0      0.01   0.02   0.23    0      0       0      0            0.98
 8 Ana_8hr                   2.12   0.02   0.06   0.04   0.7     0      0.01    0      0            2.96
 9 Ana_24hr                  2.1    0.07   0.06   0.04   0.71    0      0.03    0      0            2.99
10 Ana_28day                 1.01   0.3    0.27   3.06   2.78    0.04   2.5     0.42   0           10.4 
11 CA_4hr                    2.05   0.14   0.05   0.01   0.68    0      0       0      0            2.93
12 CA_8hr                    1.92   0.31   0.06   0.01   0.78    0.01   0       0      0            3.09
13 CA_24hr                   1.79   0.28   0.04   0.02   0.62    0      0       0      0            2.76
14 CA_28day                 29.7    1.3    0.43   2.49   2.09    0      0.03    2.48   0           38.5 

So I've attached a picture of my csv, hopefully it shows here. Basically, what I want to do is create a column for the average of each respective column amongst each treatment group. Then I want to have a graph that has the treatment groups as the x-axis, and then the average for each of the acids as a different column. So within the Aer 4 hr group there will be a bar for the average of Succinic Acid, the average of Lactic Acid, etc. Then Aer 8 hr will have the same columns. Does this make sense? I'm not sure how to format the plotting information to use all of the average columns as they are all different variables. Or how to format the data.frame so that I can make all of these into an extra variable.

So I've attached a picture of my csv, hopefully it shows here. Basically, what I want to do is create a column for the average of each respective column amongst each treatment group. Then I want to have a graph that has the treatment groups as the x-axis, and then the average for each of the acids as a different column. So within the Aer 4 hr group there will be a bar for the average of Succinic Acid, the average of Lactic Acid, etc. Then Aer 8 hr will have the same columns. Does this make sense? I'm not sure how to format the plotting information to use all of the average columns as they are all different variables. Or how to format the data.frame so that I can make all of these into an extra variable.

the code I'm using:

# RSTUDIO PACKAGE INSTALLATION ################################################

library(janitor)
library(tidyverse)
library(ggplot2)
library(readr)
library(dplyr)

# IMPORT THE DATA #############################################################
# Read the spreadsheet export into a data frame. The path is relative, so the
# file is looked up in the current working directory.
organic_acids<-read.csv(file='OCT19_Queue_organic acid summary8.csv')


# Print the raw table and a per-column summary to inspect what was read.
organic_acids
summary(organic_acids)

# FORMATTING SPREADSHEET COLUMN NAMES AND TYPES ###############################

# Each gsub() call below removes or rewrites a fragment of every column name.
# NOTE(review): gsub() treats its pattern as a regular expression, so every "."
# in these patterns matches ANY character, not only a literal dot. Pass
# fixed = TRUE (or escape the dots) if only literal dots are intended.
colnames(organic_acids)<-gsub("X..","",colnames(organic_acids))

colnames(organic_acids)<-gsub("..db.","",colnames(organic_acids))

# Turns e.g. "<any char>Acid" into "_Acid".
colnames(organic_acids)<-gsub(".Acid","_Acid",colnames(organic_acids))

# Presumably strips a byte-order-mark artifact from the first column name --
# TODO confirm against the raw header.
colnames(organic_acids)<-gsub("ï..","",colnames(organic_acids))

# rename() takes new_name = old_name pairs.
organic_acids<-organic_acids %>% 
  rename("experiment_id"='Experiment.ID',
           'sample_name'='OA.sample.name',
         'treatment_group'='Sample_overview',
         'wet_biomass_extracted_g'='Wet.Biomass.extracted',
         'mc_content'='Moisture.content',
         'dry_biomass_g'='Dry.Biomass.extracted')

# Run readr::parse_number() on every column whose name contains any of the
# listed fragments, so those columns end up numeric.
organic_acids<-organic_acids %>% 
  mutate(across(contains(c("mc_content","MC_avg","_Acid","A_avg","total_avg","A_stdev","total_stdev")),parse_number))

organic_acids
                                     
# summary(organic_acids)
# view(organic_acids)
# sapply(organic_acids,class)

# CREATING DATA PLOTS #########################################################

# Keep only the treatment label and the ten "_avg" columns to be plotted.
plotting_variables<-organic_acids %>% 
  select(treatment_group,SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
         OA_total_avg)

# Drop every row that has an NA in any of the selected columns.
plotting_variables_remove_na<-na.omit(plotting_variables)

plotting_variables_remove_na



# Keep just the ten value columns (the treatment label is dropped here).
numeric_variables<-plotting_variables_remove_na %>% 
  select(SA_avg,LA_avg,FA_avg,AA_avg,PA_avg,IBA_avg,BA_avg,IVA_avg,VA_avg,
           OA_total_avg)

# unlist() flattens those ten columns into one long vector (10 values per
# original row), stored as column `x` of a new data frame. The link between a
# value and its treatment group / acid is only kept in the row names.
numeric_variables<-data.frame(x=unlist(numeric_variables))

numeric_variables

################################
# NOTE(review): y is mapped to the data frame object `numeric_variables`
# itself, not to a column of plotting_variables_remove_na, which is why ggplot
# fails with "default method not implemented for type 'list'".
plotting_variables_remove_na %>% 
  ggplot()+
  geom_col(aes(x=treatment_group,y=numeric_variables))

This is what my environment looks like for number of variables

image

And this is the error I'm getting:

> plotting_variables_remove_na %>% 
+   ggplot()+
+   geom_col(aes(x=treatment_group,y=numeric_variables))
Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
Error in is.finite(x) : default method not implemented for type 'list'

I know this is a very roundabout way of getting this plot, but I've tried a lot of different things and can't get it to work. I'm very new to coding in general, especially R, and I can't seem to figure it out. I tried unlisting the information in the numeric_variables to try to get it to just one variable to use in aes(). When I tried without the unlisting it was having problems, as the y was then more than one variable. I'm just very confused. I don't know if I need to make a new data frame and merge something? Any help at all would be very helpful.

A screenshot is not very useful, can you please turn this into a proper REPRoducible EXample (reprex) illustrating your issue?

If you'd like to include all variables with respect to their treatment_group, then you'd either need to layer those individual variables within a plot or you would have to reshape your data into a longer format so that you can more easily call the variables you want.

Longer format example

Using iris dataset

Reshaping a $150 \times 5$ table.

     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
  1:          5.1         3.5          1.4         0.2    setosa
  2:          4.9         3.0          1.4         0.2    setosa
  3:          4.7         3.2          1.3         0.2    setosa
  4:          4.6         3.1          1.5         0.2    setosa
  5:          5.0         3.6          1.4         0.2    setosa
 ---                                                            
146:          6.7         3.0          5.2         2.3 virginica
147:          6.3         2.5          5.0         1.9 virginica
148:          6.5         3.0          5.2         2.0 virginica
149:          6.2         3.4          5.4         2.3 virginica
150:          5.9         3.0          5.1         1.8 virginica

To a $600 \times 4$ table.
The [Plant] variable is generated to define distinct records, as each row of the prior table is implied to belong to a single plant.

     Plant   Species Measure Type Measure
  1:     1    setosa Sepal.Length     5.1
  2:     1    setosa  Sepal.Width     3.5
  3:     1    setosa Petal.Length     1.4
  4:     1    setosa  Petal.Width     0.2
  5:     2    setosa Sepal.Length     4.9
 ---                                     
596:   149 virginica  Petal.Width     2.3
597:   150 virginica Sepal.Length     5.9
598:   150 virginica  Sepal.Width     3.0
599:   150 virginica Petal.Length     5.1
600:   150 virginica  Petal.Width     1.8

Sorry, I wasn't exactly sure how to do that, thank you for the link.

#>   treatment_group     Succinic.Acid    Lactic.Acid  Formic.Acid
#> 1         T0               0.23             0            0  
#> 2         CA T0            0.22             0            0  
#> 3         Aer 4 hr         2.97            0.03         0.04  
#> 4         Aer 4 hr         2.02            0.01         0.02  
#> 5         Aer 8 hr         2.62            0.05         0.05  
#> 6         Aer 8 hr         3.42            0.05         0.05  

I know I can do it in excel to make the graph, as shown below, but I would really like to learn it in R.

If your datasets aren't particularly large, then the tidyr package is more than sufficient to get your desired outcome.

The idea is to reshape the information so that ggplot can understand the information you're giving it.

Setup

library(tidyr)
library(ggplot2)

# Small wide-format example: one row per sample, one column per acid.
# `index` only labels the original rows so they can be traced after reshaping.
# Rows 3 and 4 are replicates of the same "Aer 4 hr" treatment group, so both
# must carry exactly the same label.
df <- data.frame(
  index = 1:6,
  treatment_group = c(
    "T0", "CA T0", "Aer 4 hr", "Aer 4 hr", "Aer 8 hr", "Aer 8 hr"
  ),
  Succinic.Acid = c(0.23, 0.22, 2.97, 2.02, 2.62, 3.42),
  Lactic.Acid = c(0, 0, 0.03, 0.01, 0.05, 0.05),
  Formic.Acid = c(0, 0, 0.04, 0.02, 0.05, 0.05)
)

df output (wide format)

  index treatment_group Succinic.Acid Lactic.Acid Formic.Acid
1     1              T0          0.23        0.00        0.00
2     2           CA T0          0.22        0.00        0.00
3     3        Aer 4 hr          2.97        0.03        0.04
4     4        Aer 4 hr          2.02        0.01        0.02
5     5        Aer 8 hr          2.62        0.05        0.05
6     6        Aer 8 hr          3.42        0.05        0.05

Reshape from wide to long format

# Stack the acid columns into name/value pairs: the column name goes to
# `Type`, the measurement to `Value`, giving one row per (sample, acid).
df_1 <- pivot_longer(
  df,
  cols = Succinic.Acid:Formic.Acid,
  names_to = "Type",
  values_to = "Value"
)

df_1 (long format)

index is used to illustrate reshaping

# A tibble: 18 x 4
   index treatment_group Type          Value
   <int> <chr>           <chr>         <dbl>
 1     1 T0              Succinic.Acid  0.23
 2     1 T0              Lactic.Acid    0   
 3     1 T0              Formic.Acid    0   
 4     2 CA T0           Succinic.Acid  0.22
 5     2 CA T0           Lactic.Acid    0   
 6     2 CA T0           Formic.Acid    0   
 7     3 Aer 4 hr        Succinic.Acid  2.97
 8     3 Aer 4 hr        Lactic.Acid    0.03
 9     3 Aer 4 hr        Formic.Acid    0.04
10     4 Aer 4 hr        Succinic.Acid  2.02
11     4 Aer 4 hr        Lactic.Acid    0.01
12     4 Aer 4 hr        Formic.Acid    0.02
13     5 Aer 8 hr        Succinic.Acid  2.62
14     5 Aer 8 hr        Lactic.Acid    0.05
15     5 Aer 8 hr        Formic.Acid    0.05
16     6 Aer 8 hr        Succinic.Acid  3.42
17     6 Aer 8 hr        Lactic.Acid    0.05
18     6 Aer 8 hr        Formic.Acid    0.05

Plot (ggplot2)

# Bar height comes straight from Value and colour from Type; position_dodge()
# places the bars of each treatment group side by side instead of stacking.
ggplot(
  data = df_1,
  mapping = aes(x = treatment_group, y = Value, fill = Type)
) +
  geom_col(position = position_dodge())

That worked, thank you

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.