Is there a way to do a filtered rank that preserves a full data frame?

dplyr

#1

I'm doing data analysis on a large data.frame. Is there a way to do a rank based on filtered criteria that preserves the full data.frame? The code would work similar to using dplyr filter results, but preserve the contents of the whole data.frame after the mutate?

I've pasted my code below. df_calc was my attempt at a solution, but the ranks are being done on the aggregate list of players, not players that meet the ifelse criteria. df_calc_correct gives me the desired results, but I'm looking for those ranks to be part of the original df_calc data.frame, with any values that don't meet the criteria to be listed as "NA".

library(tidyverse)

# Data
df <- read.table(sep="\t", text="
  namePlayer    groupPosition   minutesTotals   fgmTotals   fgaTotals   fg3mTotals  fg3aTotals  fg2mTotals  fg2aTotals  ftmTotals   ftaTotals   orbTotals   drbTotals   trbTotals   astTotals   stlTotals   blkTotals   tovTotals   pfTotals    ptsTotals
                 Anthony Davis  C   1267    353 698  34 105 319 593 236 294 114 340 454 151 58  90   71  86  976
                 Bradley Beal   G   1392    336 714  89 262 247 452 137 174  36 148 184 180 47  32   98 115  898
                 Damian Lillard G   1347    335 741 117 297 218 444 229 256  30 151 181 219 35  19  104  69 1016
                 Giannis Antetokounmpo  F   1146    335 572  12  79 323 493 212 305  83 346 429 207 45  54  145 116  894
                 James Harden   G   1261    331 752 162 416 169 336 318 374  26 175 201 291 70  19  189 116 1142
                 Joel Embiid    C   1255    330 680  42 142 288 538 295 368  89 413 502 129 21  70  128 125  997
                 Kemba Walker   G   1276    324 734 122 335 202 399 171 208  16 142 158 223 48  18   89  61  941
                 Kevin Durant   F   1399    383 760  70 192 313 568 274 300  19 274 293 237 32  39  133  74 1110
                 LeBron James   F   1178    340 656  68 191 272 465 180 264  32 251 283 243 44  24  116  54  928
                 Paul George    F   1271    332 734 119 315 213 419 179 215  56 235 291 146 82  22   99 106  962", header=TRUE, stringsAsFactors=FALSE)

df_calc <- df %>%
  # Overall Rank
  mutate(o_rank = rank(desc(ptsTotals))) %>%
  # Rank by Position
  group_by(groupPosition) %>%
  mutate(position_rank = rank(desc(ptsTotals))) %>%
  ungroup() %>%
  # Conditional Rank
  mutate(custom_rank = ifelse(groupPosition %in% c("G", "F") & position_rank > 3 |
                              groupPosition =="C" & position_rank > 3, rank(desc(ptsTotals)), NA ))


df_calc_correct <-  df %>%
  # Overall Rank
  mutate(o_rank = rank(desc(ptsTotals))) %>%
  # Rank by Position
  group_by(groupPosition) %>%
  mutate(position_rank = rank(desc(ptsTotals))) %>%
  ungroup() %>%
  # Conditional Rank
  filter(groupPosition %in% c("G", "F") & position_rank > 3 |
                                groupPosition =="C" & position_rank > 3) %>% 
  mutate(custom_rank = rank(desc(ptsTotals)))
          
df_calc
#> # A tibble: 10 x 23
#>    namePlayer groupPosition minutesTotals fgmTotals fgaTotals fg3mTotals
#>    <chr>      <chr>                 <int>     <int>     <int>      <int>
#>  1 "        ~ C                      1267       353       698         34
#>  2 "        ~ G                      1392       336       714         89
#>  3 "        ~ G                      1347       335       741        117
#>  4 "        ~ F                      1146       335       572         12
#>  5 "        ~ G                      1261       331       752        162
#>  6 "        ~ C                      1255       330       680         42
#>  7 "        ~ G                      1276       324       734        122
#>  8 "        ~ F                      1399       383       760         70
#>  9 "        ~ F                      1178       340       656         68
#> 10 "        ~ F                      1271       332       734        119
#> # ... with 17 more variables: fg3aTotals <int>, fg2mTotals <int>,
#> #   fg2aTotals <int>, ftmTotals <int>, ftaTotals <int>, orbTotals <int>,
#> #   drbTotals <int>, trbTotals <int>, astTotals <int>, stlTotals <int>,
#> #   blkTotals <int>, tovTotals <int>, pfTotals <int>, ptsTotals <int>,
#> #   o_rank <dbl>, position_rank <dbl>, custom_rank <dbl>

df_calc_correct
#> # A tibble: 2 x 23
#>   namePlayer groupPosition minutesTotals fgmTotals fgaTotals fg3mTotals
#>   <chr>      <chr>                 <int>     <int>     <int>      <int>
#> 1 "        ~ G                      1392       336       714         89
#> 2 "        ~ F                      1146       335       572         12
#> # ... with 17 more variables: fg3aTotals <int>, fg2mTotals <int>,
#> #   fg2aTotals <int>, ftmTotals <int>, ftaTotals <int>, orbTotals <int>,
#> #   drbTotals <int>, trbTotals <int>, astTotals <int>, stlTotals <int>,
#> #   blkTotals <int>, tovTotals <int>, pfTotals <int>, ptsTotals <int>,
#> #   o_rank <dbl>, position_rank <dbl>, custom_rank <dbl>
Created on 2019-01-04 by the reprex package (v0.2.1)

#2

One solution is to compute the conditional ranks separately, then join with the full table.

df_calc_correct <-  df %>%
  # Overall Rank
  mutate(o_rank = rank(desc(ptsTotals))) %>%
  # Rank by Position
  group_by(groupPosition) %>%
  mutate(position_rank = rank(desc(ptsTotals))) %>%
  ungroup() %>% 
  mutate(key=seq_len(nrow(.)))

cond_rank = df_calc_correct %>%
  # Conditional Rank
  filter(groupPosition %in% c("G", "F") & position_rank > 3 |
           groupPosition =="C" & position_rank > 3) %>% 
  mutate(custom_rank = rank(desc(ptsTotals))) %>% 
  select(key, custom_rank)

df_calc_correct = left_join(df_calc_correct, cond_rank) %>% 
  select(-key)

#3

Another option is to create a new points column containing NA for the ones you don't want to rank:

df_calc_correct <-  df %>%
  # Overall Rank
  mutate(o_rank = rank(desc(ptsTotals))) %>%
  # Rank by Position
  group_by(groupPosition) %>%
  mutate(position_rank = rank(desc(ptsTotals))) %>%
  ungroup() %>% 
  # Conditional Rank
  mutate(custom_pts = ifelse(groupPosition %in% c("G", "F") & position_rank > 3 |
           groupPosition =="C" & position_rank > 3, ptsTotals, NA)) %>% 
  mutate(custom_rank = rank(desc(custom_pts), na.last='keep')) %>% 
  select(-custom_pts)

#4

Thanks Kent. I think I like this solution better.


#5

If your question's been answered (even if by you), would you mind choosing a solution? (See FAQ below for how).

Having questions checked as resolved makes it a bit easier to navigate the site visually and see which threads still need help.

Thanks


#6

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.