removing NA from used column in ggplot

Hi!
I have a data set with more than 17000 observations which contain many NAs. I compare different values from different columns with each other.
Although ggplot2 removes the incomplete rows when drawing, it still writes "NA" into the legend. For that reason, I'm trying to remove the NAs for geom_point, but only in the columns I'm using in the specific graph. I tried "na.rm", "na.omit", "!is.na", "complete.cases", and "filter", but nothing has worked so far.

Here is my code:

library(ggplot2)
library(dplyr)
library(tidyverse)

# Sample data: 15 observations; row 14 has a missing `gleich_entf_fakt`.
Vergleich <- data.frame(
  mean_hh_wohnbdl = c(6, 6, 3, 3, 3, 1, 9, 1, 7, 2, 4, 9, 3, 3, 9),
  gleich_entf_fakt = c(
    16.00, 66.67, 50.00, 14.29, 42.86, 17.14, 33.33, 116.67,
    200.00, 20.00, 66.67, 40.00, 70.00, NA, 33.33
  )
)

# Bin `gleich_entf_fakt` into categories "5" to "1". The labels are symmetric
# around the (90, 110] bin, so bins that share a label end up in the same
# factor level. A missing `gleich_entf_fakt` stays NA in `gleich_entf_kat`.
Vergleich <- mutate(
  Vergleich,
  gleich_entf_kat = cut(
    gleich_entf_fakt,
    breaks = c(0, 30, 50, 70, 90, 110, 130, 150, 170, 200000),
    labels = c("5", "4", "3", "2", "1", "2", "3", "4", "5")
  )
)

# Scatter plot of `gleich_entf_fakt` per Bundesland, coloured by category.
# Both plotted columns are already numeric and `gleich_entf_kat` is already a
# factor, so no coercion is needed inside aes(). na.rm = TRUE only silences
# the missing-value warning from geom_point(); it does not touch the legend.
plot_bundesland <- ggplot(
  Vergleich,
  aes(x = mean_hh_wohnbdl, y = gleich_entf_fakt)
) +
  geom_point(aes(color = gleich_entf_kat), na.rm = TRUE) +
  scale_color_manual(
    values = c("red4", "red3", "orange", "green3", "green4"),
    labels = c("sehr schlecht", "schlecht", "mäßig", "gut", "sehr gut")
  ) +
  scale_x_continuous(
    breaks = 1:9,
    name = "Bundesland",
    labels = c("B", "K", "NÖ", "OÖ", "S", "ST", "T", "V", "W")
  ) +
  labs(
    title = "Übereinstimmung nach Bundesland",
    y = "Übereinstimmungsgrad",
    color = "gleich_entf_kat"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

plot_bundesland

# ggsave() is a separate call, not a layer: chaining it with `+` is rejected
# by current ggplot2. Passing `plot` explicitly saves exactly this plot
# rather than whatever was displayed last.
ggsave("04.01-Uebereinstimmung_Bundesland.png", plot = plot_bundesland)

And my graph with NA in the legend:

I hope I have explained my problem clearly enough.
Thanks in advance.

1 Like

Try piping into the drop_na() function prior to plotting, e.g.:

# Drop every row that has an NA in any column, then plot what remains.
my_data %>%
  drop_na() %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point()

or for specific variables:

# Drop only the rows where `my_var` is NA; NAs in other columns are kept.
my_data %>%
  drop_na(my_var) %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point()
2 Likes

Thank you for your reply.
Because my data contains NAs in other columns as well, I cannot use the first variant - no data would remain after applying it. Can I also pass more than one variable to drop_na(my_var)?

You only need drop_na() for the variable used for the color, though you can pass more than one variable if you need to, e.g. drop_na(var1, var2).

library(tidyverse)

# Sample data: 15 observations; row 14 has a missing `gleich_entf_fakt`.
Vergleich <- data.frame(
  mean_hh_wohnbdl = c(6, 6, 3, 3, 3, 1, 9, 1, 7, 2, 4, 9, 3, 3, 9),
  gleich_entf_fakt = c(
    16.00, 66.67, 50.00, 14.29, 42.86, 17.14, 33.33, 116.67,
    200.00, 20.00, 66.67, 40.00, 70.00, NA, 33.33
  )
)

# Bin `gleich_entf_fakt` into categories "5" to "1". The labels are symmetric
# around the (90, 110] bin, so bins that share a label end up in the same
# factor level. A missing `gleich_entf_fakt` stays NA in `gleich_entf_kat`.
Vergleich <- mutate(
  Vergleich,
  gleich_entf_kat = cut(
    gleich_entf_fakt,
    breaks = c(0, 30, 50, 70, 90, 110, 130, 150, 170, 200000),
    labels = c("5", "4", "3", "2", "1", "2", "3", "4", "5")
  )
)

# Scatter plot of `gleich_entf_fakt` per Bundesland, coloured by category.
# drop_na() removes the rows with a missing category (or Bundesland) before
# the data reaches ggplot(), so the colour scale never sees an NA and no "NA"
# entry is added to the legend. Only the listed columns are checked; NAs in
# other columns do not cause rows to be dropped.
Vergleich %>%
  drop_na(gleich_entf_kat, mean_hh_wohnbdl) %>%
  ggplot(aes(x = mean_hh_wohnbdl, y = gleich_entf_fakt)) +
  geom_point(aes(color = gleich_entf_kat)) +
  scale_color_manual(
    values = c("red4", "red3", "orange", "green3", "green4"),
    labels = c("sehr schlecht", "schlecht", "mäßig", "gut", "sehr gut")
  ) +
  scale_x_continuous(
    breaks = 1:9,
    name = "Bundesland",
    labels = c("B", "K", "NÖ", "OÖ", "S", "ST", "T", "V", "W")
  ) +
  labs(
    title = "Übereinstimmung nach Bundesland",
    y = "Übereinstimmungsgrad",
    color = "gleich_entf_kat"
  ) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Created on 2020-08-16 by the reprex package (v0.3.0)

3 Likes

Thanks a lot, now it's working.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.