geom_point with 2 categorical variables

Problem

I am plotting data with 2 categorical variables, and I want to visualize all the data points for each unique combination of categories. Usually, people do something like:

# Illustrative snippet only: the `...` placeholders stand for the user's own
# data, aesthetics and jitter settings.
g = ggplot(...) + 
  geom_point(
    # jitter displaces every point by a random offset
    position = position_jitter(...)
  )

but the result is a) ugly, and b) unreliable. I propose a solution:

Solution

Essentially it is a new "position" called position_bunch, which distributes the points at each unique (X,Y) according to a pattern. It can be used like:

# Illustrative snippet only: the `...` placeholder stands for the user's own
# data and aesthetics.
g = ggplot(...) +
  geom_point(
    position = position_bunch(
      shape = 'hex',  # one of 'hex', 'square', 'spiral'
      width = .7,     # passed to position_bunch as the bunch width
      sort  = TRUE    # no trailing comma: an extra empty argument is an error
    )
  )

yielding plots like the following (with jitter shown for comparison):

Implementation

 # Position adjustment that spreads the points sharing the same (x,y) over a
 # fixed pattern centred on that location.
 #
 # Arguments:
 #   shape  one of 'hex', 'square' or 'spiral'; selects the pair of helper
 #          functions (n.layer.* / delta.*) that generate the pattern.
 #   width  the pattern offsets are multiplied by width/2/l.max, where l.max
 #          is the layer count needed by the largest group.
 #   sort   FALSE/0 keeps the offsets in the order delta.* generates them;
 #          any other value sorts them by distance from the centre, with -1
 #          giving decreasing order within each group.
 #
 # Returns a ggproto 'PositionBunch' object for use as `position = ...`.
 position_bunch = function(shape='hex',width=0.5,sort=1) {
   # Fail immediately on an unknown shape instead of at draw time
   shape = match.arg(shape,c('hex','square','spiral'))
   pattern = switch(shape,
     hex    = list(n.layer = n.layer.hex,    delta = delta.hex),
     square = list(n.layer = n.layer.square, delta = delta.square),
     spiral = list(n.layer = n.layer.spiral, delta = delta.spiral)
   )
   n.layer.fun = pattern$n.layer
   delta.fun   = pattern$delta
   if (sort) {
     sort.fun = sorting.fun
   } else {
     # identity() accepts a single argument, but sort.fun is always called
     # with (delta,dir); use a two-argument pass-through instead
     sort.fun = function(delta,dir) { return(delta) }
   }
   cols = c('x','y')
   return(ggproto('PositionBunch',Position,
     required_aes = cols,
     compute_layer = function(self,data,params,layout) {
       if (nrow(data) == 0) {
         return(data)
       }
       # Unique locations; rows with a missing coordinate are left untouched
       u = unique(data[,cols])
       u = u[!is.na(u$x) & !is.na(u$y),,drop=FALSE]
       if (nrow(u) == 0) {
         return(data)
       }
       # Resolve the members of every group BEFORE moving anything, so that
       # already-displaced points can never be matched to another location
       groups = lapply(seq_len(nrow(u)),function(i) {
         which((data$x==u$x[i]) & (data$y==u$y[i]))
       })
       n = lengths(groups)
       # One shared pattern, sized for the largest group, so all bunches
       # use the same spacing
       l.max = n.layer.fun(max(n))
       delta = sort.fun(delta.fun(l.max),1)
       scale = width/2/l.max
       for (rows in groups) {
         # Take the first length(rows) offsets, then apply the requested order
         delta.i = sort.fun(delta[seq_along(rows),cols],sort) * scale
         data[rows,cols] = data[rows,cols] + delta.i
       }
       return(data)
     })
   )
 }
# Reorder the rows of `delta` by squared distance from the origin.
#   delta  data frame of numeric offsets (one point per row)
#   dir    -1 sorts farthest-first; any other value sorts nearest-first
sorting.fun = function(delta,dir) {
  sq.dist    = rowSums(delta^2)
  descending = (dir==-1)
  ranking    = order(sq.dist,decreasing=descending)
  return(delta[ranking,])
}
# -----------------------------------------------------------------------------
# hex
# Number of hexagonal rings requested for n points.
# `root` is the positive solution L of 1 + 3*L*(L+1) = n; the result is
# floor(1 + root), so it is never less than 1 for n >= 1.
n.layer.hex = function(n) {
  root = (-3+sqrt(9+12*(n-1)))/6
  return(floor(1+root))
}
# Offsets of a hexagonal pattern with `layers` rings around (0,0).
# Returns a data frame (x,y) whose first row is the centre, followed by the
# points of ring 1, ring 2, ... in walking order; ring k adds 6*k points.
delta.hex = function(layers) {
  half = 0.5
  rise = sqrt(3)/2
  # Unit moves along the six sides of a ring
  x.dirs = c(+half,-half,-1,-half,+half,+1)
  y.dirs = c(+rise,+rise, 0,-rise,-rise, 0)
  # Moves for one ring: every side is walked `layer` times, and the first
  # move gets an extra `hop` that carries the walk out from the previous ring
  ring = function(dirs,layer,hop) {
    moves = rep(dirs,each=layer)
    moves[1] = moves[1] + hop
    return(moves)
  }
  x.moves = lapply(1:layers,function(k) { ring(x.dirs,k,+half) })
  y.moves = lapply(1:layers,function(k) { ring(y.dirs,k,-rise) })
  # Leading 0 is the centre point; positions are the running sum of moves
  return(data.frame(
    x = cumsum(c(0,unlist(x.moves))),
    y = cumsum(c(0,unlist(y.moves)))
  ))
}
# -----------------------------------------------------------------------------
# square
# Number of square rings requested for n points.
# delta.square() places 8*k points on ring k, so L rings plus the centre hold
# 1 + 4*L*(L+1) points. The positive solution of 1 + 4*L*(L+1) = n is
# L = (sqrt(n)-1)/2; as in n.layer.hex the result is floor(1 + L), which is
# never less than 1 for n >= 1 and always provides at least n points.
# (The previous formula solved 1 + 2*L*(L+1) = n, which overestimated the
# ring count and therefore shrank the bunch below the requested width.)
n.layer.square = function(n) {
  return(floor(1+(sqrt(n)-1)/2))
}
# Offsets of a square pattern with `layers` rings around (0,0).
# Returns a data frame (x,y) whose first row is the centre, followed by the
# points of ring 1, ring 2, ... in walking order; ring k adds 8*k points.
delta.square = function(layers) {
  # Unit moves along the four sides of a ring
  x.dirs = c( 0,-1, 0,+1)
  y.dirs = c(+1, 0,-1, 0)
  # Moves for one ring: every side is walked 2*layer times, and the first
  # move gets an extra `hop` that carries the walk out from the previous ring
  ring = function(dirs,layer,hop) {
    moves = rep(dirs,each=2*layer)
    moves[1] = moves[1] + hop
    return(moves)
  }
  x.moves = lapply(1:layers,function(k) { ring(x.dirs,k,+1) })
  y.moves = lapply(1:layers,function(k) { ring(y.dirs,k,-1) })
  # Leading 0 is the centre point; positions are the running sum of moves
  return(data.frame(
    x = cumsum(c(0,unlist(x.moves))),
    y = cumsum(c(0,unlist(y.moves)))
  ))
}
# -----------------------------------------------------------------------------
# spiral
# Points per spiral "layer"; delta.spiral() uses the same value as its
# angular step (in radians).
f.spiral = (1+sqrt(5))*pi
# Number of spiral layers requested for n points: n/f.spiral, rounded up.
n.layer.spiral = function(n) {
  layers = ceiling(n/f.spiral)
  return(layers)
}
# Offsets of a spiral pattern sized for `layers` layers.
# Generates ceiling(layers*f.spiral)+1 terms with radius sqrt(i*layers)/2 and
# angle pi*(1+sqrt(5))*i, and returns the running sums of their x and y
# components as a data frame (x,y).
# NOTE(review): the terms look like absolute spiral positions, yet they are
# accumulated with cumsum() as in the hex/square helpers - confirm that this
# is intended before changing it, since the overall scale depends on it.
delta.spiral = function(layers){
  idx    = 0:ceiling(layers*f.spiral)
  radius = layers/2*sqrt(idx/layers)
  angle  = pi*(1+sqrt(5))*idx
  terms  = list(x = radius * cos(angle), y = radius * sin(angle))
  return(data.frame(lapply(terms,cumsum)))
}

Test Code

library('ggplot2')
library('gridExtra')
library('viridis')
source('ggpositions.r')
# Demo: for three sample sizes, draw the same random data with jitter and
# with each position_bunch shape, then save the 3x4 grid to test.png.
set.seed(1234)

w = .7
g.list = list()
for (N in c(10,100,500)){
  # Explicit levels keep factor() valid even when a small sample happens to
  # miss one of the three categories (labels= alone would then error)
  data = data.frame(
    x = factor(floor(runif(N,1,3+1)),levels=1:3,labels=c('A','B','C')),
    y = factor(floor(runif(N,1,3+1)),levels=1:3),
    z = sort(runif(N,1,N),decreasing=TRUE)
  )
  for (shape in c('jitter','hex','square','spiral')){
    if (shape == 'jitter'){
      # w/4 on each side as the jitter baseline for comparison
      pos = position_jitter(width=w/4,height=w/4)
    } else {
      pos = position_bunch(
        shape = shape,
        width = w
      )
    }
    # Point size shrinks with log10(N) so larger samples stay legible
    g = ggplot(data,aes(x=x,y=y,color=z)) +
      geom_point(position=pos,size=sqrt(2)/log10(N)) +
      scale_color_viridis() +
      xlab(NULL) + ylab(NULL) +
      theme(legend.position='none')
    # Column titles only on the first row of the grid
    if (N==10){ g = g + ggtitle(shape) }
    g.list[[length(g.list)+1]] = g
  }
}
# One row per sample size, one column per position
G = do.call(arrangeGrob,c(g.list,list(nrow=3)))
ggsave('test.png',G,width=8,height=6)

Notes

  • It's a work in progress, so feedback welcome! What do you think?

  • I've only tested it with geom_point using aes(x= ,y= ) so far

  • Point sizes are hard to scale reliably, so you may have to tinker manually

  • After cleaning & testing, I plan to upload to the ggplot2 extensions library

2 Likes

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.