multiple columns to wide format with dcast

I have data that looks like this:

structure(list(date_jmp = structure(c(2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 7L, 12L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 7L, 12L), .Label = c("", "4/16/2020", "4/17/2020", "4/22/2020", "4/26/2020", "4/30/2020", "5/21/2020", "5/4/2020", "5/5/2020", "5/7/2020", "5/8/2020", "6/2/2020"), class = "factor"), date = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("", "2020_04_16", "2020_04_17", "2020_04_22", "2020_04_26", "2020_04_30", "2020_05_04", "2020_05_05", "2020_05_07", "2020_05_08", "2020_05_21", "2020_06_02"), class = "factor"), days_since_begin = c(0L, 1L, 6L, 10L, 14L, 18L, 19L, 21L, 22L, 35L, 47L, 0L, 1L, 6L, 10L, 14L, 18L, 19L, 21L, 22L, 35L, 47L), pair_plant = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), plant = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), tag_colors = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("", "buff_red_blue", "buff_red_green", "buff_yellow_blue", "buff_yellow_green", "gray_red_blue", "gray_red_green", "gray_yellow_blue", "gray_yellow_green", "red", "yellow"), class = "factor"), sod_root = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "root", "sod"), class = "factor"), cot_juv = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", "cot", "juv"), class = "factor"), condition = c(4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, NA, 4L, 4L, 4L, 5L, 4L, 4L, 4L, 5L, 5L, 5L, NA), condition_alive = c(4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, NA, 4L, 4L, 4L, 5L, 4L, 4L, 4L, 5L, 5L, 5L, NA), open_leaves = c(NA, NA, 1, 1, 2, NA, NA, 2, NA, 4, 5, NA, NA, 1, 1, 2, NA, NA, 2, NA, 3, 5), height = c(NA, NA, NA, 3.7, NA, NA, NA, 6.2, NA, 7.2, 10, NA, NA, NA, 2.6, NA, NA, NA, 4.2, NA, 4.8, 10), alive_1_dead_0 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), notes_initial_cond = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "1.8", "cracked leaf" ), class = "factor"), Date.collected = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "2020_04_12"), class = "factor"), Date.planted = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "2020_04_16"), class = "factor"), Near.far = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L), .Label = c("", "f", "n"), class = "factor"), condition_no_death = structure(c(7L, 7L, 8L, 1L, 8L, 8L, 8L, 8L, 8L, 1L, 1L, 7L, 7L, 7L, 1L, 7L, 7L, 7L, 8L, 8L, 1L, 1L), .Label = c("", " NA", " NA", "1", "2", "3", "4", "5" ), class = "factor")), row.names = c(NA, 22L), class = "data.frame")

I want to transform it to wide format. I want to make a new column for the height value for each day, a new column for the number of leaves each day, the condition, whether it's alive or dead, etc. It will be a lot of columns but I need it in this form to do a particular analysis. Posted below is an example of what I want done, but with only one column (open_leaves)

Here's the code that results in the following dataframe:

tps <- dcast(tpl, pair_plant + plant + tag_colors + sod_root + cot_juv ~ days_since_begin, value.var = "open_leaves" )

and the output:

screenshot including some notes for clarification:

This results with my data in the format that I want. So, I can do this with one variable. But I want to have more than just # of leaves. I want height (creating another 11 columns), condition (creating another 11 columns) and so on.

I'm sorry if I'm over-explaining. I just want to be really clear.

Somewhere I read that I might have to melt my data. I tried melting my data with

> tplmelt <- melt(tpl, id.vars = "plant")

and got the warning message "attributes are not identical across measure variables; they will be dropped". Also, I can replace the word "plant" with 3:8, or I can list out all of my i.d. variables and I get the same message. If I let R default to what it thinks are i.d. variables, it gets close to what I want but doesn't quite make it:

"Using date_jmp, date, tag_colors, sod_root, cot_juv, notes_initial_cond, Date.collected, Date.planted, Near.far, condition_no_death as id variables"

All of those except notes_initial_cond and condition_no_death are i.d. variables, but I also want plant and pair_plant as i.d. variables, everything else as measure variables...

Can you help me?

Hi @erinaiello,

Data are almost always easier to work with in long format. I think you should possibly reconsider converting this all into wide format. It means you are spreading a single variable/concept across multiple columns, which makes it much more difficult to work with. dplyr functions make working with long data much friendlier.

In any case, I think this achieves what you want:

library(dplyr)
library(tidyr)

x %>% 
  select(-date_jmp, -date) %>% # remove duplicate date info
  pivot_wider(
    names_from = days_since_begin,
    values_from = c(-pair_plant, -plant, -tag_colors, -days_since_begin)
  )
3 Likes

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

Hmm, I just ran it again without any errors on my end. I am using tidyr version 1.1.0. What version are you using?

Thank you! Yes, I normally work with data in long form but I can run a repeated measures more easily with short form.

The code worked but I got a warning message for 12 columns:

  • Values in column_name_here are not uniquely identified; output will contain list-cols.
  • Use values_fn = list(column_name_here) to suppress this warning.

I do so, and then I get this warning message:

  • Use values_fn = list(alive_1_dead_0 = list) to suppress this warning.
  • Use values_fn = list(alive_1_dead_0 = length) to identify where the duplicates arise
  • Use values_fn = list(alive_1_dead_0 = summary_fun) to summarise duplicates

Attempting to do so is leading to more warning messages.....