Problem reading dataframe columns

Hello everyone. I am trying to plot a data frame which contains two rows (part of it can be seen in the attached screenshot) using the following command:


# Fails as written: the parser reads each `-` in these bare column names as
# subtraction, so `total.2022-01-10` is evaluated as `total.2022 - 01 - 10`.
barplot(cbind(total.2022-01-10, total.2022-01-11, total.2022-01-04, total.2022-01-05, total.2022-01-12) ~ set, data = combined_wide, col = rainbow(5))

As can be seen in the code and in the screenshot, I get an error :

Error in cbind(total.2022 - 1 - 10, total.2022 - 1 - 11, total.2022 -  : 
  object 'total.2022' not found

This seems to occur because R does not recognize - as part of the names of the variables in
the data frame.

Can someone please tell me how I can plot the data frame in question?

Best regards.

Try changing the hyphens to dashes

Thanks for the answer. I have changed the hyphens to dashes, as in the following code

# Fails at parse time: the en dash (–) is not valid input to the R parser.
barplot(cbind(total.2022–01–10, total.2022–01–11, total.2022–01–04, total.2022–01–05, total.2022–01–12) ~ set, data = combined_wide, col = rainbow(5))

And the error has changed to

Error: unexpected input in "barplot(cbind(total.2022–"

I have saved the dataframe as a csv file, but I do not think that I can upload it.

The dataframe comes from casos_hosp_uci_def_sexo_edad_provres.csv, a dataframe which can be found here, in case you want to take a look at a similar situation.

I’m sorry I meant underscores. But I’ll also take a look at the CSV to confirm

Thanks again. Using underscores, I now get

Error in cbind(total.2022_01_10, total.2022_01_11, total.2022_01_04, total.2022_01_05,  : 
  object 'total.2022_01_10' not found

The column names I see in the csv file from the source are

> colnames(dat)
[1] "provincia_iso" "sexo"          "grupo_edad"    "fecha"        
[5] "num_casos"     "num_hosp"      "num_uci"       "num_def"

so variables like total.2022_01_10 would be missing whether with - or _

Sorry, I forgot to mention that I used the following code to tackle the original data to sum the total number of infected people by day

# Total number of cases per day.
# split() returns its groups in sorted `fecha` order, so the date column is
# sorted as well; pairing the totals with plain unique() (order of first
# appearance) would mislabel them whenever `db` is not already sorted by date.
dbDate <- data.frame(
  fecha = sort(unique(db$fecha)),
  total = vapply(split(db$num_casos, db$fecha), sum, numeric(1))
)

This produces two fields fecha and total so the variable names being used are still not present.

Are you trying to produce something like this?

library(readr)
library(ggplot2)

# Load the case data; readr parses `fecha` as a date column (see the column
# specification printed below).
db <- read_csv("/home/roc/Downloads/casos_hosp_uci_def_sexo_edad_provres.csv")
#> Rows: 1299030 Columns: 8
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (3): provincia_iso, sexo, grupo_edad
#> dbl  (4): num_casos, num_hosp, num_uci, num_def
#> date (1): fecha
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Total number of cases per day.
# split() returns its groups in sorted `fecha` order, so the date column is
# sorted as well; pairing the totals with plain unique() (order of first
# appearance) would mislabel them whenever `db` is not already sorted by date.
dbDate <- data.frame(
  fecha = sort(unique(db$fecha)),
  total = vapply(split(db$num_casos, db$fecha), sum, numeric(1))
)

# Scatter plot of the daily totals over time.
p <- ggplot(dbDate, aes(fecha, total))
p + geom_point() + theme_minimal()

Created on 2022-10-15 by the reprex package (v2.0.1)

This is my whole code. I want to make a barplot of the days with the most infections and differentiate between group M and group H


# Read the raw case data from the working directory.
db<-read.csv(file = 'casos_hosp_uci_def_sexo_edad_provres.csv')

##############################################################

# Day of most infections
# Total cases per day over all rows: split the data by `fecha` and sum
# `num_casos` within each group.
# NOTE(review): unique() lists dates in order of first appearance while split()
# returns groups in sorted order; the two columns only line up if `fecha`
# already appears in sorted order in the file -- confirm.
dbDate=data.frame(fecha = unique(db$fecha), 
           total = sapply(split(db, f = db$fecha), function(x) {sum(x[['num_casos']])}))

# Days ranked from most to fewest cases.
dbDateSort<-dbDate[order(dbDate$total, decreasing = T),]

# Most infections province A
# Keep the rows for province "A"; na.omit() then drops every row that
# contains an NA.
dbSub <- db[db$provincia_iso == "A",]
dbSub<-na.omit(dbSub)

# Daily totals for province A (same unique()/split() pairing as above).
dbDateSub=data.frame(fecha = unique(dbSub$fecha), 
                  total = sapply(split(dbSub, f = dbSub$fecha), function(x) {sum(x[['num_casos']])}))

# Province A days ranked from most to fewest cases.
dbDateSortSub<-dbDateSub[order(dbDateSub$total, decreasing = T),]


# Most infections in province A for days of most infections nationwide
# match() gives each province date's position in the nationwide ranking
# (dbDateSort); order() then sorts the province rows by that position.
dbDateSubSortNacional <-dbDateSub[order(match(dbDateSub[,1],dbDateSort[,1])),]


# Most infections for group H in province A for days of most infections nationwide
dbSubH<- db[db$provincia_iso == "A" & db$sexo=="H",]
dbSubH<-na.omit(dbSubH)
dbDateSubH=data.frame(fecha = unique(dbSubH$fecha), 
                     total = sapply(split(dbSubH, f = dbSubH$fecha), function(x) {sum(x[['num_casos']])}))

# Group H daily totals, ordered by the nationwide ranking.
dbDateSubHSortNacional <-dbDateSubH[order(match(dbDateSubH[,1],dbDateSort[,1])),]


# Most infections for group M in province A for days of most infections nationwide


dbSubM<- db[db$provincia_iso == "A" & db$sexo=="M",]
dbSubM<-na.omit(dbSubM)
dbDateSubM=data.frame(fecha = unique(dbSubM$fecha), 
                      total = sapply(split(dbSubM, f = dbSubM$fecha), function(x) {sum(x[['num_casos']])}))

# Group M daily totals, ordered by the nationwide ranking.
dbDateSubMSortNacional <-dbDateSubM[order(match(dbDateSubM[,1],dbDateSort[,1])),]


# Attempt to plot both groups together

# Tag each group, stack the two tables, then reshape to wide format: one row
# per `set` and one `total.<fecha>` column per date.
dbDateSubHSortNacional$set <- 'H'
dbDateSubMSortNacional$set <- 'M'
combined <- rbind(dbDateSubHSortNacional, dbDateSubMSortNacional)
combined_wide <- reshape(combined, direction = 'wide', idvar = 'set', timevar = 'fecha')
# Fails with "object 'total.2022_01_10' not found": the reshaped column names
# are built from the `fecha` values, which still use `-` as the separator, so
# no column with underscores exists in combined_wide.
barplot(cbind(total.2022_01_10, total.2022_01_11, total.2022_01_04, total.2022_01_05, total.2022_01_12) ~ set, data = combined_wide, col = rainbow(5))

I think I have not explained myself very well. I want a plot of five bars, each one with two colours, representing M and H.

OK. Two problems

  1. combined-wide should be combined_wide; in general, don't use operator characters such as - + / * %% in object names
  2. the column names of combined_wide still contain - separators
library(readr)
library(ggplot2)

# Load the case data; readr parses `fecha` as a date column (see the column
# specification printed below).
db <- read_csv("/home/roc/Downloads/casos_hosp_uci_def_sexo_edad_provres.csv")
#> Rows: 1299030 Columns: 8
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (3): provincia_iso, sexo, grupo_edad
#> dbl  (4): num_casos, num_hosp, num_uci, num_def
#> date (1): fecha
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Sum `num_casos` per `fecha` for any subset of the case data.
#
# data: a data frame with `fecha` and `num_casos` columns.
# Returns a data frame with one row per date: `fecha` (sorted) and `total`.
#
# split() returns its groups in sorted `fecha` order, so the dates are sorted
# too; pairing the totals with plain unique() (order of first appearance) would
# mislabel them whenever `data` is not already sorted by date.
total_by_date <- function(data) {
  data.frame(
    fecha = sort(unique(data$fecha)),
    total = vapply(split(data$num_casos, data$fecha), sum, numeric(1))
  )
}

# Nationwide daily totals, then the days ranked from most to fewest cases
dbDate <- total_by_date(db)
dbDateSort <- dbDate[order(dbDate$total, decreasing = TRUE), ]

# Most infections province A
# na.omit() drops every row that contains an NA.
dbSub <- db[db$provincia_iso == "A", ]
dbSub <- na.omit(dbSub)

dbDateSub <- total_by_date(dbSub)

dbDateSortSub <- dbDateSub[order(dbDateSub$total, decreasing = TRUE), ]

# Most infections in province A for days of most infections nationwide
# match() gives each province date's position in the nationwide ranking;
# order() then sorts the province rows by that position.
dbDateSubSortNacional <-
  dbDateSub[order(match(dbDateSub[, 1], dbDateSort[, 1])), ]

# Most infections for group H in province A for days of most infections
# nationwide
dbSubH <- db[db$provincia_iso == "A" & db$sexo == "H", ]
dbSubH <- na.omit(dbSubH)
dbDateSubH <- total_by_date(dbSubH)

dbDateSubHSortNacional <-
  dbDateSubH[order(match(dbDateSubH[, 1], dbDateSort[, 1])), ]

# Most infections for group M in province A for days of most infections
# nationwide
dbSubM <- db[db$provincia_iso == "A" & db$sexo == "M", ]
dbSubM <- na.omit(dbSubM)
dbDateSubM <- total_by_date(dbSubM)

dbDateSubMSortNacional <-
  dbDateSubM[order(match(dbDateSubM[, 1], dbDateSort[, 1])), ]

# Attempt to plot both groups together

# Tag each group and stack the two tables.
dbDateSubHSortNacional$set <- "H"
dbDateSubMSortNacional$set <- "M"
combined <- rbind(dbDateSubHSortNacional, dbDateSubMSortNacional)

# change name from combined-wide
# Wide format: one row per `set`, one `total.<fecha>` column per date.
combined_wide <- reshape(
  combined,
  direction = "wide",
  idvar = "set",
  timevar = "fecha"
)

# change variable names from hyphen to underscore so they can be written as
# bare names inside the formula below
under_scored <- gsub("-", "_", colnames(combined_wide), fixed = TRUE)

colnames(combined_wide) <- under_scored

# Draws one bar per `set` (H, M), each stacked from the five date columns with
# one colour per date.
# NOTE(review): the thread asks for one bar per date split into H and M;
# `barplot(total ~ set + fecha, data = ...)` on the matching rows of `combined`
# should give that layout -- confirm against ?barplot.
barplot(
  cbind(
    total.2022_01_10, total.2022_01_11, total.2022_01_04,
    total.2022_01_05, total.2022_01_12
  ) ~ set,
  data = combined_wide,
  col = rainbow(5)
)

I see. Thanks for the info.

You might want to consider using {ggalluvial}

Thanks for the info.

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.