Simplify and make the if statement more efficient?

I have this for loop that runs through my data frame, looking at each of 8 variables, and if any of them is greater than "max_percent_missing, it sets a "lazy_case" variable to 1. I have tried to put the variable list into a list, or other variable and tried to run the logical test against it, but to no avail. Is it possible to just do this with a single if condition against all the variable.

for(i in 1:nrow(dataframe)) {
  if(dataframe$lazy_a[i] > max_percent_missing |
     dataframe$lazy_b[i] > max_percent_missing | 
     dataframe$lazy_c[i] > max_percent_missing |
     dataframe$lazy_d[i] > max_percent_missing |
     dataframe$lazy_e[i] > max_percent_missing |
     dataframe$lazy_f[i] > max_percent_missing |
     dataframe$lazy_g[i] > max_percent_missing |
     dataframe$lazy_h[i] > max_percent_missing)
    {
    dataframe$lazy_case[i] <- 1
  } 
}

Below is one approach that uses the pivot_longer() function to reshape the data prior to comparing each value to max_pct_missing.

library(tidyverse)

set.seed(4)

# sample data
df = data.frame(
  row = 1:5,
  lazy_a = runif(5),
  lazy_b = runif(5),
  lazy_c = runif(5)
) |>
  mutate(max_pct_missing = 0.9)

df
#>   row      lazy_a     lazy_b    lazy_c max_pct_missing
#> 1   1 0.585800305 0.26042777 0.7546750             0.9
#> 2   2 0.008945796 0.72440589 0.2860006             0.9
#> 3   3 0.293739612 0.90609215 0.1000535             0.9
#> 4   4 0.277374958 0.94904022 0.9540688             0.9
#> 5   5 0.813574215 0.07314447 0.4156071             0.9
  
# check for row numbers with at least one value greater than max_pct_missing
check = df |>
  pivot_longer(cols = c(-'row', -'max_pct_missing')) |>
  filter(value > max_pct_missing) |>
  distinct(row, lazy_case = 1)

check
#> # A tibble: 2 × 2
#>     row lazy_case
#>   <int>     <dbl>
#> 1     3         1
#> 2     4         1

# final output
left_join(df, check)
#> Joining with `by = join_by(row)`
#>   row      lazy_a     lazy_b    lazy_c max_pct_missing lazy_case
#> 1   1 0.585800305 0.26042777 0.7546750             0.9        NA
#> 2   2 0.008945796 0.72440589 0.2860006             0.9        NA
#> 3   3 0.293739612 0.90609215 0.1000535             0.9         1
#> 4   4 0.277374958 0.94904022 0.9540688             0.9         1
#> 5   5 0.813574215 0.07314447 0.4156071             0.9        NA

Created on 2023-03-28 with reprex v2.0.2.9000

1 Like

Good practice to provide data with a reprex. (See the FAQ).

When in doubt, abstract away the common elements.

When modifying all-numeric data, take advantage of matrix algebra.

trim_to <- function(x,y) {
  wide = dim(x)[1]
  long = dim(x)[2]
  size = wide * long
  m = as.matrix(x,nrow = wide)
  n = matrix(rep(y,size),nrow = wide)
  m[which(m > n)] = 1
  return(m)
}

d <- data.frame(
  lazy_a = c(0.24, 0.66, 0.09, 0.08, 0.9, 0.18,0.88, 0.19),
  lazy_b = c(0.13, 0.51, 0.5, 0.92, 0.21, 0.59, 0.24,0.13), 
  lazy_c = c(0.22, 0.42, 0.51, 0.61, 0.1, 0.99, 0.77, 0.65), 
  lazy_d = c(0.18, 0.47, 0.44, 0.8, 0.03, 0.58, 0.01, 0.86),
  lazy_e = c(0.77, 0.69, 0.97, 0.95, 0.06, 0.25, 0.82, 0.57), 
  lazy_f = c(0.83, 0.75, 0.63, 0.08, 0.96, 0.09, 0.66, 0.41), 
  lazy_g = c(0.9, 0.54, 0.27, 0.97, 0.13, 0.21, 0.86, 0.1), 
  lazy_h = c(0.59, 0.74, 0.25, 0.84, 0.57, 0.44, 0.27, 0.4))

d
#>   lazy_a lazy_b lazy_c lazy_d lazy_e lazy_f lazy_g lazy_h
#> 1   0.24   0.13   0.22   0.18   0.77   0.83   0.90   0.59
#> 2   0.66   0.51   0.42   0.47   0.69   0.75   0.54   0.74
#> 3   0.09   0.50   0.51   0.44   0.97   0.63   0.27   0.25
#> 4   0.08   0.92   0.61   0.80   0.95   0.08   0.97   0.84
#> 5   0.90   0.21   0.10   0.03   0.06   0.96   0.13   0.57
#> 6   0.18   0.59   0.99   0.58   0.25   0.09   0.21   0.44
#> 7   0.88   0.24   0.77   0.01   0.82   0.66   0.86   0.27
#> 8   0.19   0.13   0.65   0.86   0.57   0.41   0.10   0.40
(d <- trim_to(d,0.75))
#>      lazy_a lazy_b lazy_c lazy_d lazy_e lazy_f lazy_g lazy_h
#> [1,]   0.24   0.13   0.22   0.18   1.00   1.00   1.00   0.59
#> [2,]   0.66   0.51   0.42   0.47   0.69   0.75   0.54   0.74
#> [3,]   0.09   0.50   0.51   0.44   1.00   0.63   0.27   0.25
#> [4,]   0.08   1.00   0.61   1.00   1.00   0.08   1.00   1.00
#> [5,]   1.00   0.21   0.10   0.03   0.06   1.00   0.13   0.57
#> [6,]   0.18   0.59   1.00   0.58   0.25   0.09   0.21   0.44
#> [7,]   1.00   0.24   1.00   0.01   1.00   0.66   1.00   0.27
#> [8,]   0.19   0.13   0.65   1.00   0.57   0.41   0.10   0.40

Created on 2023-03-28 with reprex v2.0.2

1 Like

Try something like this.

library(tidyverse)
dataframe <- dateframe  |> mutate(bigger  = if_any(lazy_a:lazy_h, ~ .x > max_percent_missing))
dataframe <- dataframe(mutate(lazy_case = if_else(bigger, 1, lazy_case)
1 Like

This is gorgeous! Another thought, say I had a screener variable that would negate the value in variable lazy_h? Do I need to keep that separate (and before) and stop your code at lazy_g as below? if_any() or across() do not appear to allow multiple logical tests.

library(tidyverse)
for(i in 1:nrow(dataframe)) {
  if(dataframe$lazy_h_screener[i] == 2 && 
     dataframe$lazy_h[i] > max_percent_missing) {
    dataframe$lazy_case[i] <- 1
  }
}

dataframe <- dateframe  |> mutate(bigger  = if_any(lazy_a:lazy_g, ~ .x > max_percent_missing))
dataframe <- dataframe |> mutate(lazy_case = if_else(bigger, 1, lazy_case)
dataframe <- dateframe  |> 
  mutate(bigger  = if_any(lazy_a:lazy_g, ~ .x > max_percent_missing) |
                   if_any(lazy_h,        ~ .x > max_percent_missing & lazy_h_screener==2)
         )

I think this would be easier.

1 Like

AHH! Ok. I should just try things rather than to give up if I don't see it in a vignette.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.