How to make a comparative histogram of two different databases?

Hi everyone!!

I am trying to make a histogram using two different databases. I want to see how the bacterial communities differ between the two databases, and to show in the graph only the communities that make up more than 0.01% of the total. The following image shows an example of what I would like to obtain.

These are my two databases that I want to compare:

# Counts per sample (`index`) for each bacterial phylum, as classified with
# the SILVA database.
# NOTE(review): the tibble built by tibble::tribble() is passed here as the
# `stringsAsFactors` argument of data.frame(); a reply later in this thread
# reports that this wrapper produces an error. tibble::tribble() on its own
# already returns the table.
bpdataSILVA <-data.frame (stringsAsFactors = tibble::tribble(
                                           ~index, ~Acidobacteria, ~Actinobacteria, ~Armatimonadetes, ~BRC1, ~Bacteroidetes, ~Chlamydiae, ~Chloroflexi,
                                             "1A",              0,             375,                0,     0,           5948,           0,            0,
                                             "1B",              0,              31,                0,     0,           4948,           0,            0,
                                             "1C",              0,              31,                0,     0,           1036,           0,            0,
                                             "1D",              0,             788,                0,     0,           1690,           3,            0,
                                             "1E",              0,            3778,                0,     0,           5390,          67,           22,
                                             "1F",              9,            9326,                0,     0,          11923,         150,          129,
                                             "1G",              0,             875,                0,     0,           3356,           0,            0,
                                             "1H",              4,             338,                0,     0,          12836,           5,           76,
                                             "2A",              0,            4456,                0,     0,          15445,          61,            0,
                                             "2B",              0,            1802,                0,     0,          10218,         208,          243,
                                             "2C",              0,            2093,                0,     0,          14982,          67,           37,
                                             "2D",              0,            1168,                0,     0,           3602,          60,          119,
                                             "2E",              0,            1999,                0,     0,           6141,           0,           25
                                           )
)

# Counts per sample (`index`) for each bacterial phylum, as classified with
# the RDP database. Its columns differ from the SILVA table: it has
# Saccharibacteria and no Chloroflexi.
# NOTE(review): the tibble built by tibble::tribble() is passed here as the
# `stringsAsFactors` argument of data.frame(); a reply later in this thread
# reports that this wrapper produces an error. tibble::tribble() on its own
# already returns the table.
bpdataRDP <-data.frame (stringsAsFactors = tibble::tribble(
                                                ~index, ~Acidobacteria, ~Actinobacteria, ~Armatimonadetes, ~BRC1, ~Bacteroidetes, ~Saccharibacteria, ~Chlamydiae,
                                                  "1A",              0,             389,                0,     0,           5936,                 0,           0,
                                                  "1B",              0,              33,                0,     0,           4927,                 0,           0,
                                                  "1C",              0,              31,                0,     0,           1035,                 0,           0,
                                                  "1D",              0,             797,                0,     0,           1690,                 0,           3,
                                                  "1E",              0,            3864,                0,     0,           5390,                 4,          67,
                                                  "1F",              9,            9570,                0,     0,          10966,                 0,         148,
                                                  "1G",              0,             878,                0,     0,           3329,                 0,           0,
                                                  "1H",              4,             338,                0,     0,          12756,                 3,           5,
                                                  "2A",              0,            4450,                0,     0,          15453,                 0,          61,
                                                  "2B",              0,            1819,                0,     0,          10255,                 5,         208,
                                                  "2C",              0,            2093,                0,     0,          15262,                 0,          67,
                                                  "2D",              0,            1163,                0,     0,           3485,                 0,          60,
                                                  "2E",              0,            1988,                0,     0,           6123,                 0,           0
                                                )
 )

Created on 2020-02-03 by the reprex package (v0.3.0)

Thank you so much!!

Osiris

Please test your code before posting it; you should not have added the data.frame() part, because it produces an error.

I'm not sure I understand your question, because you haven't defined how "abundance" should be calculated, but this should at least get you very close.
BTW, what you are showing is a bar plot, not a histogram.

library(tidyverse)

# Counts per sample for each bacterial phylum under the SILVA classification.
# Built column by column: `index` is the sample ID, and every other column
# holds that phylum's count for the 13 samples, in `index` order.
bpdataSILVA <- tibble::tibble(
  index = c(
    "1A", "1B", "1C", "1D", "1E", "1F", "1G", "1H",
    "2A", "2B", "2C", "2D", "2E"
  ),
  Acidobacteria = c(0, 0, 0, 0, 0, 9, 0, 4, 0, 0, 0, 0, 0),
  Actinobacteria = c(
    375, 31, 31, 788, 3778, 9326, 875, 338,
    4456, 1802, 2093, 1168, 1999
  ),
  Armatimonadetes = rep(0, 13),
  BRC1 = rep(0, 13),
  Bacteroidetes = c(
    5948, 4948, 1036, 1690, 5390, 11923, 3356, 12836,
    15445, 10218, 14982, 3602, 6141
  ),
  Chlamydiae = c(0, 0, 0, 3, 67, 150, 0, 5, 61, 208, 67, 60, 0),
  Chloroflexi = c(0, 0, 0, 0, 22, 129, 0, 76, 0, 243, 37, 119, 25)
)

# Counts per sample for each bacterial phylum under the RDP classification.
# Built column by column: `index` is the sample ID, and every other column
# holds that phylum's count for the 13 samples, in `index` order.
bpdataRDP <- tibble::tibble(
  index = c(
    "1A", "1B", "1C", "1D", "1E", "1F", "1G", "1H",
    "2A", "2B", "2C", "2D", "2E"
  ),
  Acidobacteria = c(0, 0, 0, 0, 0, 9, 0, 4, 0, 0, 0, 0, 0),
  Actinobacteria = c(
    389, 33, 31, 797, 3864, 9570, 878, 338,
    4450, 1819, 2093, 1163, 1988
  ),
  Armatimonadetes = rep(0, 13),
  BRC1 = rep(0, 13),
  Bacteroidetes = c(
    5936, 4927, 1035, 1690, 5390, 10966, 3329, 12756,
    15453, 10255, 15262, 3485, 6123
  ),
  Saccharibacteria = c(0, 0, 0, 0, 4, 0, 0, 3, 0, 5, 0, 0, 0),
  Chlamydiae = c(0, 0, 0, 3, 67, 148, 0, 5, 61, 208, 67, 60, 0)
)

# Stacked bar chart comparing the taxonomic composition obtained with the RDP
# and SILVA databases: one bar per database, one fill colour per taxon.
bind_rows(list("RDP" = bpdataRDP, "SILVA" = bpdataSILVA), .id = "bpdata") %>%
    # Long format: one row per database / sample / taxon.
    # pivot_longer() replaces the superseded gather().
    pivot_longer(-c(bpdata, index),
                 names_to = "Bacteria", values_to = "Count") %>%
    # Total count per taxon within each database. A taxon present in only one
    # of the two tables is NA for the other after bind_rows(), so
    # na.rm = TRUE makes its total 0 there.
    group_by(bpdata, Bacteria) %>%
    summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
    # Share of each taxon in its own database's total.
    group_by(bpdata) %>%
    mutate(Proportion = Count / sum(Count)) %>%
    ungroup() %>%
    # Keep only taxa that make up at least 0.01% (0.0001) of the total.
    filter(Proportion >= 0.0001) %>%
    ggplot(aes(x = bpdata, y = Proportion, fill = Bacteria)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent_format())

Created on 2020-02-04 by the reprex package (v0.3.0.9001)

1 Like

Hi Andres, thanks for all your help! Just one more question: is there any way to make the percentage axis increase in steps of 10 (0%, 10%, 20%, ...)?

You can specify the break points explicitly:

# Same stacked bar chart of taxon proportions per database (RDP vs SILVA),
# with y-axis breaks every 10 percentage points.
bind_rows(list("RDP" = bpdataRDP, "SILVA" = bpdataSILVA), .id = "bpdata") %>%
    # Long format: one row per database / sample / taxon.
    # pivot_longer() replaces the superseded gather().
    pivot_longer(-c(bpdata, index),
                 names_to = "Bacteria", values_to = "Count") %>%
    # Total count per taxon within each database. A taxon present in only one
    # of the two tables is NA for the other after bind_rows(), so
    # na.rm = TRUE makes its total 0 there.
    group_by(bpdata, Bacteria) %>%
    summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
    # Share of each taxon in its own database's total.
    group_by(bpdata) %>%
    mutate(Proportion = Count / sum(Count)) %>%
    ungroup() %>%
    # Keep only taxa that make up at least 0.01% (0.0001) of the total.
    filter(Proportion >= 0.0001) %>%
    ggplot(aes(x = bpdata, y = Proportion, fill = Bacteria)) +
    geom_col() +
    # Breaks at 0, 0.1, ..., 1, labelled as whole percentages (0%, 10%, ...).
    scale_y_continuous(breaks = seq(0, 1, 0.1),
                       labels = scales::percent_format(accuracy = 1))

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.