How to data wrangle in an automatized way

AbhishekHP · September 19, 2019, 7:29am

Please consider an reprex below:

Can we arrive at the expected output without manually creating intermediate column names via summarise and mutate ?

library(tidyverse)

# Input data
data = data.frame(
 season = c("SU","SU","WI","WI"),
 farmer = c("A","B","A","B"),
 "2013" = c(25,20,22,10),
 "2014" = c(56,40,100,90),
 "2015" = c(60,30,160,80),
 "2016" = c(90,60,120,50),
 "2017" = c(52,20,62,10),
 "2018" = c(5,2,29,10)
)

# Can we arrive at this output without manual naming the column names ?
# e.g. each summarise and mutate has a manually named column names
# Can we make it auto-generated ?
expected_output = data %>%
  gather(key = key, value = value,
         ("X2013":"X2018")) %>%
  mutate(farmer_year = paste(farmer,key),
         key = NULL,
        value = value %>% as.integer()) %>%
  spread(farmer_year, value) %>%
  group_by(season) %>%
  summarise(`A X2013` = sum(`A X2013` %>% as.integer(), na.rm = T),
            `B X2013` = sum(`B X2013` %>% as.integer(),na.rm = T),
            `A X2014` = sum(`A X2014` %>% as.integer(), na.rm = T),
            `B X2014` = sum(`B X2014`  %>% as.integer(), na.rm = T),
            `A X2015` = sum(`A X2015` %>% as.integer(), na.rm = T),
            `B X2015` = sum(`B X2015` %>% as.integer(), na.rm = T),
            `A X2016` = sum(`A X2016` %>% as.integer(), na.rm = T),
            `B X2016` = sum(`B X2016` %>% as.integer(), na.rm = T),
            `A X2017` = sum(`A X2017` %>% as.integer(), na.rm = T),
            `B X2017` = sum(`B X2017` %>% as.integer(), na.rm = T),
            `A X2018` = sum(`A X2018` %>% as.integer(), na.rm = T),
            `B X2018` = sum(`B X2018` %>% as.integer(), na.rm = T)) %>% 
mutate(ratio_2013 =  100*`B X2013`/`A X2013`,
         farmer_growth_2014 = 100*`A X2014`/`A X2013`,
         ratio_2014 =  100*`B X2014`/`A X2014`,
         ratio_diff_13_14 = ratio_2014 - ratio_2013,
         
         farmer_growth_2015 = 100*`A X2015`/`A X2014`,
         ratio_2015 = 100*`B X2015`/`A X2015`,
         ratio_diff_14_15 = ratio_2015 - ratio_2014,
         
         farmer_growth_2016 = 100*`A X2016`/`A X2015`,
         ratio_2016 = 100*`B X2016`/`A X2016`,
         ratio_diff_15_16 = ratio_2016 - ratio_2015,
         
         farmer_growth_2017 = 100*`A X2017`/`A X2016`,
         ratio_2017 = 100* `B X2017`/`A X2017`,
         ratio_diff_16_17 = ratio_2017 - ratio_2016,
         
         farmer_growth_2018 = 100*`A X2018`/`A X2017`,
         ratio_2018 = 100*`B X2018`/`A X2018`,
         ratio_diff_17_18 = ratio_2018 - ratio_2017)

Yarnabrina · September 19, 2019, 8:44am

Up to the summarise part, you can do it much easily:

dataset %>%
  pivot_longer(cols = matches(match = "^X"),
               names_to = "Year",
               names_prefix = "X",
               names_ptypes = list("Year" = integer()),
               values_to = "Observations",
               values_ptypes = list("Observations" = integer())) %>%
  unite(col = farmer_Year,
        farmer, Year) %>%
  pivot_wider(names_from = farmer_Year,
              values_from = Observations)

Will update with a reprex after mutate part.

Yarnabrina · September 19, 2019, 10:35am

@AbhishekHP, sorry, but I cannot do what you want. I did the following, which contains all information you want, but the layout of the table is different. I'll follow this thread to learn a new elegant trick, but I give up.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)

dataset <- data.frame(season = c("SU", "SU", "WI", "WI"),
                      farmer = c("A", "B", "A", "B"),
                      "2013" = c(25, 20, 22, 10),
                      "2014" = c(56, 40, 100, 90),
                      "2015" = c(60, 30, 160, 80),
                      "2016" = c(90, 60, 120, 50),
                      "2017" = c(52, 20, 62, 10),
                      "2018" = c(5, 2, 29, 10))

dataset %>%
  pivot_longer(cols = matches(match = "^X"),
               names_to = "Year",
               names_prefix = "X",
               names_ptypes = list("Year" = integer()),
               values_to = "Observations",
               values_ptypes = list("Observations" = integer())) %>%
  group_by(season, Year) %>%
  pivot_wider(names_from = farmer,
              values_from = Observations) %>%
  group_by(season) %>%
  mutate(ratio = 100 * B / A,
         farmer_growth = 100 * A / lag(x = A),
         ratio_diff = ratio - lag(x = ratio)) %>%
  ungroup()
#> # A tibble: 12 x 7
#>    season  Year     A     B ratio farmer_growth ratio_diff
#>    <fct>  <int> <int> <int> <dbl>         <dbl>      <dbl>
#>  1 SU      2013    25    20  80           NA         NA   
#>  2 SU      2014    56    40  71.4        224         -8.57
#>  3 SU      2015    60    30  50          107.       -21.4 
#>  4 SU      2016    90    60  66.7        150         16.7 
#>  5 SU      2017    52    20  38.5         57.8      -28.2 
#>  6 SU      2018     5     2  40            9.62       1.54
#>  7 WI      2013    22    10  45.5         NA         NA   
#>  8 WI      2014   100    90  90          455.        44.5 
#>  9 WI      2015   160    80  50          160        -40   
#> 10 WI      2016   120    50  41.7         75         -8.33
#> 11 WI      2017    62    10  16.1         51.7      -25.5 
#> 12 WI      2018    29    10  34.5         46.8       18.4

^{Created on 2019-09-19 by the reprex package (v0.3.0)}

AbhishekHP · September 19, 2019, 11:45am

Thanks for helping me.
I tried help feature in RStudio but
couldnt find pivot_longer or pivot_wider

mara · September 19, 2019, 11:55am

Do you have the most recent version of tidyr (1.0.0) installed?

AbhishekHP · September 19, 2019, 2:07pm

not able to unload tiddyr 0.8.1 even after using remove.packages("tidyr")

mara · September 19, 2019, 2:35pm

Make sure you're starting RStudio from a blank slate (see the link below, which explains how to ensure this)

Start a fresh R session. You should have no libraries attached, or objects in your environment. You don't need to remove previous versions of tidyr, it will be overwritten by running:

install.packages("tidyr")

AbhishekHP · September 19, 2019, 4:11pm

Thanks for the tip. Actually, I restarted but

AbhishekHP · September 19, 2019, 4:15pm

Yarnabrina · September 19, 2019, 4:17pm

I don't know much about this, but try opening RStudio as sudo. That at least shouldn't give you permission problems.

andresrcs · September 19, 2019, 4:24pm

An additional pivot_wider() takes you one step closer, just cosmetic details missing in the column names.

library(dplyr)
library(tidyr)


data %>%
    pivot_longer(cols = matches(match = "^X"),
                 names_to = "Year",
                 names_prefix = "X",
                 names_ptypes = list("Year" = integer()),
                 values_to = "Observations",
                 values_ptypes = list("Observations" = integer())) %>%
    group_by(season, Year) %>%
    pivot_wider(names_from = farmer,
                values_from = Observations) %>%
    group_by(season) %>%
    mutate(ratio = 100 * B / A,
           farmer_growth = 100 * A / lag(x = A),
           ratio_diff = ratio - lag(x = ratio)) %>%
    pivot_wider(names_from = c(Year), values_from = c(A, B, ratio, farmer_growth, ratio_diff)) %>%
    select_if(~!all(is.na(.))) %>% 
    ungroup()
#> # A tibble: 2 x 29
#>   season A_2013 A_2014 A_2015 A_2016 A_2017 A_2018 B_2013 B_2014 B_2015
#>   <fct>   <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
#> 1 SU         25     56     60     90     52      5     20     40     30
#> 2 WI         22    100    160    120     62     29     10     90     80
#> # … with 19 more variables: B_2016 <int>, B_2017 <int>, B_2018 <int>,
#> #   ratio_2013 <dbl>, ratio_2014 <dbl>, ratio_2015 <dbl>,
#> #   ratio_2016 <dbl>, ratio_2017 <dbl>, ratio_2018 <dbl>,
#> #   farmer_growth_2014 <dbl>, farmer_growth_2015 <dbl>,
#> #   farmer_growth_2016 <dbl>, farmer_growth_2017 <dbl>,
#> #   farmer_growth_2018 <dbl>, ratio_diff_2014 <dbl>,
#> #   ratio_diff_2015 <dbl>, ratio_diff_2016 <dbl>, ratio_diff_2017 <dbl>,
#> #   ratio_diff_2018 <dbl>

AbhishekHP · September 19, 2019, 4:27pm

Thanks @andresrcs and @Yarnabrina,
I had almost gave up but you helped me
Now, I got to somehow install tidyr new version

system · September 26, 2019, 4:27pm

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.