@mara and @rexevan. Here's the code I've come up with. Is there a more efficient way to get to this?
Also, say I want to use the variables in the slope table in a separate calculations. (The value for the slope of weighted and HR). Is there an effective way to do this?
library(tidyverse)
library(broom)
league_stats <- tibble::tribble(
~season, ~team, ~hr, ~sb, ~rbi, ~w, ~sv, ~era,
2016L, "Red Sox", 208L, 83L, 836L, 93L, 43L, 4,
2017L, "Astros", 238L, 98L, 854L, 101L, 45L, 4.12,
2018L, "Athletics", 227L, 35L, 778L, 97L, 44L, 3.82,
2018L, "Red Sox", 208L, 125L, 829L, 108L, 46L, 3.75,
2018L, "Yankees", 267L, 63L, 821L, 100L, 49L, 3.78,
2017L, "Yankees", 241L, 90L, 821L, 91L, 36L, 3.75,
2018L, "Indians", 216L, 135L, 786L, 91L, 41L, 3.77,
2017L, "Indians", 212L, 88L, 780L, 102L, 37L, 3.3,
2018L, "Astros", 205L, 71L, 763L, 103L, 46L, 3.11,
2015L, "Astros", 230L, 121L, 691L, 86L, 39L, 3.57,
2016L, "Indians", 185L, 134L, 733L, 94L, 37L, 3.86,
2016L, "Astros", 198L, 102L, 689L, 84L, 44L, 4.06,
2015L, "Yankees", 212L, 63L, 737L, 87L, 48L, 4.03,
2015L, "Indians", 141L, 86L, 640L, 81L, 38L, 3.68,
2015L, "Red Sox", 161L, 71L, 706L, 78L, 40L, 4.34,
2017L, "Red Sox", 168L, 106L, 735L, 93L, 39L, 3.73,
2017L, "Athletics", 234L, 57L, 708L, 75L, 35L, 4.67,
2016L, "Yankees", 183L, 72L, 647L, 84L, 48L, 4.16,
2015L, "Athletics", 146L, 78L, 661L, 68L, 28L, 4.16,
2016L, "Athletics", 169L, 50L, 634L, 69L, 42L, 4.51
)
long_league_stats <- league_stats %>%
select(team, season, hr, sb, rbi, w, sv, era) %>%
gather(key = stat, value = "value", 3:8 ) %>%
mutate(weight = case_when(
season == max(season) ~ .45,
season == (max(season) - 1) ~ .35,
season == (max(season) - 2) ~ .20,
season <= (max(season) - 3) ~ .00,
TRUE ~ NA_real_
))
#Ranks by Season Long
ranks_by_season_long <- long_league_stats %>%
group_by(season, stat) %>%
mutate(rank = if_else(stat == "era", rank(value, ties.method = "first"), rank(-value, ties.method = "first")),
points = if_else(stat == "era", rank(-rank, ties.method = "first"), rank(-rank, ties.method = "first")),
wtd_value = value * weight) %>%
ungroup()
ranks_by_season <- ranks_by_season_long %>%
select(season, stat, rank, value, points) %>%
spread(stat, value) %>%
mutate(season = as.character(as.numeric(season)))
weighted <- ranks_by_season_long %>%
group_by(stat, rank, points) %>%
summarize(weighted = sum(wtd_value)) %>%
spread(stat, weighted) %>%
mutate(season = "weighted")
average_3yr <- ranks_by_season_long %>%
filter(season > max(season) - 3) %>%
group_by(stat, rank, points) %>%
summarize(avg_3yr = sum(value / 3)) %>%
spread(stat, avg_3yr) %>%
mutate(season = "avg_3yr")
stats_table_long <- bind_rows(ranks_by_season, weighted, average_3yr) %>%
gather(key = stat, value = value, era: w)
stats_table_long
#> # A tibble: 180 x 5
#> season rank points stat value
#> <chr> <int> <int> <chr> <dbl>
#> 1 2015 1 5 era 3.57
#> 2 2015 2 4 era 3.68
#> 3 2015 3 3 era 4.03
#> 4 2015 4 2 era 4.16
#> 5 2015 5 1 era 4.34
#> 6 2016 1 5 era 3.86
#> 7 2016 2 4 era 4
#> 8 2016 3 3 era 4.06
#> 9 2016 4 2 era 4.16
#> 10 2016 5 1 era 4.51
#> # ... with 170 more rows
slope <- stats_table_long %>%
group_by(stat, season) %>%
do(tidy(lm(value~ points, data = .))) %>%
filter(term == "points") %>%
select(season, stat, estimate) %>%
rename(slope = estimate) %>%
spread(stat, slope) %>%
select(season, hr, sb, rbi, w, sv, era)
slope
#> # A tibble: 6 x 7
#> # Groups: season [6]
#> season hr sb rbi w sv era
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2015 24.4 13.1 23.9 4.6 4.20 -0.202
#> 2 2016 9.30 19.8 49.0 5.9 2.40 -0.146
#> 3 2017 17.2 10.8 37.8 6.4 2.30 -0.313
#> 4 2018 14.3 26.2 17.5 4.00 1.80 -0.145
#> 5 avg_3yr 13.6 18.9 34.8 5.43 2.17 -0.201
#> 6 weighted 14.3 19.5 30.9 5.22 2.09 -0.204