making smarter function in R

help · October 1, 2022, 3:09am

full_function <- function(data) {
df1 <- data %>% 
mutate(new_variable = ifelse(years_in_love > 100, 99, years_in_love)) %>%
mutate(type = "love") %>%
count(new_variable, type)


df2 <- data %>% 
mutate(new_variable = ifelse(years_married > 100, 99, years_married)) %>%
mutate(type = "married") %>%
count(new_variable, type)

full_data <- bind_rows(df1, df2)

}

mexico_data <- full_function(mexico)
vietnam_data <- full_function(vietnam)

above is my code, I am wondering if there is ways to make it shorter or more generalizable? creating df1 and df2 in the function is very similar but only few things change and it does not seem to warrant that many lines of code.. I am hoping there may be quicker way to get my desired two output data frames, which is essentially just the difference in underlying data used . thank you so very much <3

FJCC · October 1, 2022, 5:02am

Here is one alternative that I named full_function2 and compared to your full_function.

In your ifelse functions, shouldn't the comparison be > 99? Otherwise, 100 remains 100 but 101 becomes 99.

library(tidyverse)
DF <- data.frame(years_in_love = c(10,45,13,45),
                 years_married = c(8,44,6,8))
full_function <- function(data) {
  df1 <- data %>% 
    mutate(new_variable = ifelse(years_in_love > 100, 99, years_in_love)) %>%
    mutate(type = "love") %>%
    count(new_variable, type)
  
  
  df2 <- data %>% 
    mutate(new_variable = ifelse(years_married > 100, 99, years_married)) %>%
    mutate(type = "married") %>%
    count(new_variable, type)
  
  full_data <- bind_rows(df1, df2)
  return(full_data)
}

full_function2 <- function(DAT) {
  DAT |> 
    mutate(married = ifelse(years_married > 100, 99, years_married),
           love = ifelse(years_in_love > 100, 99, years_in_love)) |> 
    select(married,love) |> 
    pivot_longer(cols = c("married","love"),names_to = "type",values_to = "value") |> 
    count(value,type)
}

full_function(DF)
#>   new_variable    type n
#> 1           10    love 1
#> 2           13    love 1
#> 3           45    love 2
#> 4            6 married 1
#> 5            8 married 2
#> 6           44 married 1
full_function2(DF)
#> # A tibble: 6 × 3
#>   value type        n
#>   <dbl> <chr>   <int>
#> 1     6 married     1
#> 2     8 married     2
#> 3    10 love        1
#> 4    13 love        1
#> 5    44 married     1
#> 6    45 love        2

^{Created on 2022-09-30 with reprex v2.0.2}

FactOREO · October 1, 2022, 7:59am

Here is another improvement, not regarding logic but speed:

Data <- data.frame(years_in_love = c(10,45,13,45),
                   years_married = c(8,44,6,8))
# your function
library(magrittr); library(dplyr)
full_function <- function(data) {
  df1 <- data %>% 
    mutate(new_variable = ifelse(years_in_love > 100, 99, years_in_love)) %>%
    mutate(type = "love") %>%
    count(new_variable, type)
  
  df2 <- data %>% 
    mutate(new_variable = ifelse(years_married > 100, 99, years_married)) %>%
    mutate(type = "married") %>%
    count(new_variable, type)
  
  bind_rows(df1, df2)
  # not necessary
  # return(full_data)
}
# function from @FJCC
 library(tidyr)
full_function2 <- function(DAT) {
  DAT |> 
    mutate(married = ifelse(years_married > 100, 99, years_married),
           love = ifelse(years_in_love > 100, 99, years_in_love)) |> 
    select(married,love) |> 
    pivot_longer(cols = c("married","love"),names_to = "type",values_to = "value") |> 
    count(value,type)
}

# using collapse and data.table instead of dplyr but same logic
library(collapse);library(data.table)
full_function3 <- function(DAT){
  DAT |>
    # convert to data.table
    qDT() |>
    fmutate(
      married = fifelse(years_married > 100, 99, years_married),
      love    = fifelse(years_in_love > 100, 99, years_in_love)
    ) |>
    melt.data.table(
      measure.vars = c('love','married'), variable.name = 'type', value.name = 'value'
    ) |>
    (\(x) x[, .N, by = c('type','value')])()
}
# equal?
full_function(Data)
#>   new_variable    type n
#> 1           10    love 1
#> 2           13    love 1
#> 3           45    love 2
#> 4            6 married 1
#> 5            8 married 2
#> 6           44 married 1
full_function2(Data)
#> # A tibble: 6 × 3
#>   value type        n
#>   <dbl> <chr>   <int>
#> 1     6 married     1
#> 2     8 married     2
#> 3    10 love        1
#> 4    13 love        1
#> 5    44 married     1
#> 6    45 love        2
full_function3(Data)
#>       type value N
#> 1:    love    10 1
#> 2:    love    45 2
#> 3:    love    13 1
#> 4: married     8 2
#> 5: married    44 1
#> 6: married     6 1
# I think so (except data.frame type)

# speed comparison
microbenchmark::microbenchmark(
  original = full_function(Data),
  improved = full_function2(Data),
  improved2= full_function3(Data),
  times = 100
)
#> Unit: milliseconds
#>       expr     min      lq      mean  median       uq     max neval
#>   original 25.5338 28.5682 32.774681 31.7347 34.64235 56.4048   100
#>   improved 19.7737 21.5917 23.830159 23.1554 25.12710 35.2497   100
#>  improved2  1.4976  1.7913  2.367522  2.2839  2.63385  9.6346   100

^{Created on 2022-10-01 by the reprex package (v2.0.1)}

Maybe this improves it further

Kind regards

system · October 22, 2022, 7:59am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.