Filter a dataframe based on a cut-off value in R

mtoufiq · June 1, 2022, 9:01pm

Hi,

I am trying to filter a data.frame containing values across all columns in R followed by printing (subsetting the dataframe) and counting the values passing the filtering cut-off. Basically, I am interested in filtering, subsetting dataframe <= 0.9 and then count no. of values across all columns passing the cut-off. I have provided the example datasets and function.

dput(Data)
structure(list(`S 1` = c(0.883643926, 0.248614376, 0.518091486, 
                            0.535221236, 0.415450436, -0.940323826, -0.723796576, -0.824290276, 
                            NA, -0.806255146, -0.747521326, NA, 3.20247786, 1.10402434, 1.005757776
), `S 2` = c(1.005757776, 1.005757776, 4.51601548, 3, 7.78620408, 
                -0.706674058, -0.572657338, -0.686018538, -0.514713298, -0.532390248, 
                -0.462136378, -0.512892468, 1, 1.5, 2.5), `S 3` = c(7.798089, 
                                                                       9.2058061, 5.5408169, 1.52159119, 2.63042701, NA, 1.3857699, 
                                                                       -0.152939869, -0.050295909, -0.337659179, -0.058902499, -0.072916919, 
                                                                       -0.410700949, -0.079817359, -0.313859499), `S 4` = c(1.41324408, 
                                                                                                                               9.6038562, 1.71087962, 2.95921938, 4.82199712, 3.17140358, 1.15931318, 
                                                                                                                               NA, 1.58997338, 4.76858598, NA, -0.002674678, -0.235496858, 0.065630452, 
                                                                                                                               -0.175745228), `S 5` = c(-0.167945369, 1.41324408, 1.41324408, 
                                                                                                                                                           0.741171721, 2.494610191, -0.532343489, -0.358607189, -0.442774239, 
                                                                                                                                                           -0.103589789, 0.213156301, -0.022826199, -0.096645979, 1.215920941, 
                                                                                                                                                           3.377354481, 0.033402621)), class = "data.frame", row.names = c("Entity_1", 
                                                                                                                                                                                                                           "Entity_2", "Entity_3", "Entity_4", "Entity_5", "Entity_6", 
                                                                                                                                                                                                                           "Entity_7", "Entity_8", "Entity_9", "Entity_10", "Entity_11", 
                                                                                                                                                                                                                           "Entity_12", "Entity_13", "Entity_14", "Entity_15"))
#>                  S 1        S 2         S 3          S 4         S 5
#> Entity_1   0.8836439  1.0057578  7.79808900  1.413244080 -0.16794537
#> Entity_2   0.2486144  1.0057578  9.20580610  9.603856200  1.41324408
#> Entity_3   0.5180915  4.5160155  5.54081690  1.710879620  1.41324408
#> Entity_4   0.5352212  3.0000000  1.52159119  2.959219380  0.74117172
#> Entity_5   0.4154504  7.7862041  2.63042701  4.821997120  2.49461019
#> Entity_6  -0.9403238 -0.7066741          NA  3.171403580 -0.53234349
#> Entity_7  -0.7237966 -0.5726573  1.38576990  1.159313180 -0.35860719
#> Entity_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
#> Entity_9          NA -0.5147133 -0.05029591  1.589973380 -0.10358979
#> Entity_10 -0.8062551 -0.5323902 -0.33765918  4.768585980  0.21315630
#> Entity_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
#> Entity_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
#> Entity_13  3.2024779  1.0000000 -0.41070095 -0.235496858  1.21592094
#> Entity_14  1.1040243  1.5000000 -0.07981736  0.065630452  3.37735448
#> Entity_15  1.0057578  2.5000000 -0.31385950 -0.175745228  0.03340262

my_filter <- function(xx) {
  xx <- xx[!is.na(xx)] 
  sum(ifelse((xx <= 0.9), TRUE, FALSE))
}

# Apply on all columns with "S" in the column name
library(tidyverse)
Data %>% 
  summarise(across(contains("S"), my_filter)) %>% 
  as.data.frame(row.names="counts") -> df_filter

dput(df_filter)
structure(list(`S 1` = 10L, `S 2` = 7L, `S 3` = 8L, `S 4` = 4L, 
               `S 5` = 10L), class = "data.frame", row.names = "counts")
#>        S 1 S 2 S 3 S 4 S 5
#> counts  10   7   8   4  10

# Print/subset all the values from the table passing the cut-off
What function or formula to be used for printing/saving the values..

^{Created on 2022-06-02 by the reprex package (v2.0.1)}

Thank you,

Toufiq

andresrcs · June 1, 2022, 9:38pm

Is this what you mean?

library(dplyr)

sample_data <- data.frame(
  check.names = FALSE,
    row.names = c("Entity_1","Entity_2","Entity_3",
                  "Entity_4","Entity_5","Entity_6","Entity_7","Entity_8",
                  "Entity_9","Entity_10","Entity_11","Entity_12","Entity_13",
                  "Entity_14","Entity_15"),
        `S 1` = c(0.883643926,0.248614376,0.518091486,
                  0.535221236,0.415450436,-0.940323826,-0.723796576,
                  -0.824290276,NA,-0.806255146,-0.747521326,NA,3.20247786,1.10402434,
                  1.005757776),
        `S 2` = c(1.005757776,1.005757776,4.51601548,3,
                  7.78620408,-0.706674058,-0.572657338,-0.686018538,
                  -0.514713298,-0.532390248,-0.462136378,-0.512892468,1,1.5,2.5),
        `S 3` = c(7.798089,9.2058061,5.5408169,
                  1.52159119,2.63042701,NA,1.3857699,-0.152939869,-0.050295909,
                  -0.337659179,-0.058902499,-0.072916919,-0.410700949,
                  -0.079817359,-0.313859499),
        `S 4` = c(1.41324408,9.6038562,1.71087962,
                  2.95921938,4.82199712,3.17140358,1.15931318,NA,1.58997338,
                  4.76858598,NA,-0.002674678,-0.235496858,0.065630452,
                  -0.175745228),
        `S 5` = c(-0.167945369,1.41324408,1.41324408,
                  0.741171721,2.494610191,-0.532343489,-0.358607189,
                  -0.442774239,-0.103589789,0.213156301,-0.022826199,-0.096645979,
                  1.215920941,3.377354481,0.033402621)
)

sample_data %>% 
    mutate(across(.fns = ~ . <= 0.9)) %>% 
    summarise(across(.fns = sum, na.rm = TRUE))
#>   S 1 S 2 S 3 S 4 S 5
#> 1  10   7   8   4  10

sample_data %>% 
    mutate(across(.fns = ~ if_else(. <= 0.9, ., NA_real_)))
#>                  S 1        S 2         S 3          S 4         S 5
#> Entity_1   0.8836439         NA          NA           NA -0.16794537
#> Entity_2   0.2486144         NA          NA           NA          NA
#> Entity_3   0.5180915         NA          NA           NA          NA
#> Entity_4   0.5352212         NA          NA           NA  0.74117172
#> Entity_5   0.4154504         NA          NA           NA          NA
#> Entity_6  -0.9403238 -0.7066741          NA           NA -0.53234349
#> Entity_7  -0.7237966 -0.5726573          NA           NA -0.35860719
#> Entity_8  -0.8242903 -0.6860185 -0.15293987           NA -0.44277424
#> Entity_9          NA -0.5147133 -0.05029591           NA -0.10358979
#> Entity_10 -0.8062551 -0.5323902 -0.33765918           NA  0.21315630
#> Entity_11 -0.7475213 -0.4621364 -0.05890250           NA -0.02282620
#> Entity_12         NA -0.5128925 -0.07291692 -0.002674678 -0.09664598
#> Entity_13         NA         NA -0.41070095 -0.235496858          NA
#> Entity_14         NA         NA -0.07981736  0.065630452          NA
#> Entity_15         NA         NA -0.31385950 -0.175745228  0.03340262

^{Created on 2022-06-01 by the reprex package (v2.0.1)}

mtoufiq · June 1, 2022, 10:06pm

@andresrcs excellent, thank you. Additionally, it it possible to print a table with TRUE or FALSE passing the filter cut off.

andresrcs · June 1, 2022, 10:35pm

Yes, just remove the summarize() step

sample_data %>% 
    mutate(across(.fns = ~ . <= 0.9))

mtoufiq · June 2, 2022, 7:13pm

@andresrcs

Thank you very much. This is the expected output.

In case, If I would like to calculate and consider the entities with a cut off of <= 0.9 in at least 3 out of 5 columns (S1 to S5). In which part of the function can I input 3/5 (basically, I would like to obtain 60% entities passing this cut-off value).

andresrcs · June 4, 2022, 1:26am

library(tidyverse)

sample_data <- data.frame(
    check.names = FALSE,
    row.names = c("Entity_1","Entity_2","Entity_3",
                  "Entity_4","Entity_5","Entity_6","Entity_7","Entity_8",
                  "Entity_9","Entity_10","Entity_11","Entity_12","Entity_13",
                  "Entity_14","Entity_15"),
    `S 1` = c(0.883643926,0.248614376,0.518091486,
              0.535221236,0.415450436,-0.940323826,-0.723796576,
              -0.824290276,NA,-0.806255146,-0.747521326,NA,3.20247786,1.10402434,
              1.005757776),
    `S 2` = c(1.005757776,1.005757776,4.51601548,3,
              7.78620408,-0.706674058,-0.572657338,-0.686018538,
              -0.514713298,-0.532390248,-0.462136378,-0.512892468,1,1.5,2.5),
    `S 3` = c(7.798089,9.2058061,5.5408169,
              1.52159119,2.63042701,NA,1.3857699,-0.152939869,-0.050295909,
              -0.337659179,-0.058902499,-0.072916919,-0.410700949,
              -0.079817359,-0.313859499),
    `S 4` = c(1.41324408,9.6038562,1.71087962,
              2.95921938,4.82199712,3.17140358,1.15931318,NA,1.58997338,
              4.76858598,NA,-0.002674678,-0.235496858,0.065630452,
              -0.175745228),
    `S 5` = c(-0.167945369,1.41324408,1.41324408,
              0.741171721,2.494610191,-0.532343489,-0.358607189,
              -0.442774239,-0.103589789,0.213156301,-0.022826199,-0.096645979,
              1.215920941,3.377354481,0.033402621)
)

sample_data %>% 
    mutate(across(.fns = ~ . <= 0.9)) %>%
    rownames_to_column(var = "Entity") %>% 
    rowwise() %>% 
    mutate(prop = sum(c_across(starts_with("S")), na.rm = TRUE)) %>% 
    filter(prop >= 3) %>% 
    select(-prop)
#> # A tibble: 8 × 6
#> # Rowwise: 
#>   Entity    `S 1` `S 2` `S 3` `S 4` `S 5`
#>   <chr>     <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Entity_6  TRUE  TRUE  NA    FALSE TRUE 
#> 2 Entity_7  TRUE  TRUE  FALSE FALSE TRUE 
#> 3 Entity_8  TRUE  TRUE  TRUE  NA    TRUE 
#> 4 Entity_9  NA    TRUE  TRUE  FALSE TRUE 
#> 5 Entity_10 TRUE  TRUE  TRUE  FALSE TRUE 
#> 6 Entity_11 TRUE  TRUE  TRUE  NA    TRUE 
#> 7 Entity_12 NA    TRUE  TRUE  TRUE  TRUE 
#> 8 Entity_15 FALSE FALSE TRUE  TRUE  TRUE

^{Created on 2022-06-03 by the reprex package (v2.0.1)}

I think you might want to learn data wrangling in general, you can read this free ebook to learn how to solve this kind of thing by yourself.

https://r4ds.had.co.nz/

mtoufiq · June 4, 2022, 8:42am

@andresrcs

Thank you very much, yes this R package is very helpful. I will refer this for data manipulation.

system · June 11, 2022, 8:42am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.