Hi, do any of you know how I can close connections related to my rvest-related connecting?
I'm not trying to suppress all warning messages here or anything like that -- I just want to close the connections so I don't get the warning messages related to closing unused connections.
1 thing I've tried is placing a on.exit(closeAllConnections())
somewhere in the function to try and do this if the function exits, but that doesn't seem to be working.
I've checked out other Stack Overflow questions/answers related to this, but nothing seems fruitful.
Thoughts? Here's the function below. Try running it and clicking escape and exiting early multiple times to see the warnings.
Thanks!
library(tidyverse)
library(rvest)
library(progress)
get_schedule <- function(season, ..., progress = TRUE) {
if (any(nchar(season) > 4) | any(!stringr::str_detect(season, "[0-9]{4,4}"))) {
cat("\n")
stop('\n\nMake sure your seasons are all 4-digit numbers
\rlike 1994 (for 1993-94) and 2017 (for 2016-17)\n\n')
}
if (any(season < 1988) | any(season > 1 + lubridate::year(Sys.Date())) | any(season == 2005)) {
cat("\n")
stop('\n\nOnly include seasons from (and including)
\r1988 to 2004 and from (and including)
\r2006 to whatever current season it is\n\n')
}
if (is.logical(progress) == FALSE) {
cat("\n")
stop('\n\nMake sure you set "progress" to either
\rTRUE or FALSE (it defaults to TRUE)\n\n')
}
league <- "NHL"
leagues <- league %>%
as_tibble() %>%
purrr::set_names("league") %>%
mutate_all(toupper) %>%
distinct()
seasons <- season %>%
as_tibble() %>%
purrr::set_names("season") %>%
mutate_all(as.numeric) %>%
distinct()
mydata <- tidyr::crossing(leagues, seasons)
if (progress) {
pb <- progress::progress_bar$new(format = "get_schedule() [:bar] :percent ETA: :eta", clear = FALSE, total = nrow(mydata), show_after = 0)
cat("\n")
pb$tick(0)
}
.get_schedule <- function(league, season, ...) {
seq(2.5, 3.5, by = 0.001) %>%
sample(1) %>%
Sys.sleep()
url <- str_c("https://www.hockey-reference.com/leagues/NHL_", season, "_games.html")
page <- url %>% read_html()
url_prefix <- case_when(league == "NHL" ~ "http://www.hockey-reference.com")
game_urls <- page %>%
html_nodes("#games th a") %>%
html_attr("href") %>%
str_c(url_prefix, .) %>%
as_tibble() %>%
purrr::set_names("url")
date <- page %>%
html_nodes("#games tbody th") %>%
html_text() %>%
as_tibble() %>%
mutate_all(as.character) %>%
purrr::set_names("date")
away_team <- page %>%
html_nodes("#games .left+ .left a") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("away_team")
home_team <- page %>%
html_nodes("#games td~ .left a") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("home_team")
away_goals <- page %>%
html_nodes("#games .right:nth-child(3)") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("away_goals")
home_goals <- page %>%
html_nodes("#games .right~ .left+ .right") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("home_goals")
overtime <- page %>%
html_nodes("#games td.center") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("overtime")
attendance <- page %>%
html_nodes("#games .right:nth-child(7)") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("attendance") %>%
mutate(attendance = str_replace_all(attendance, c("," = "")))
game_length <- page %>%
html_nodes("#games .right+ .right") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("messy_game_length") %>%
mutate(hours = str_split(messy_game_length, "\\:", simplify = TRUE, n = 2)[,1]) %>%
mutate(minutes = str_split(messy_game_length, "\\:", simplify = TRUE, n = 2)[,2]) %>%
mutate(game_length = ifelse(messy_game_length != "", as.numeric(hours) * 60 + as.numeric(minutes), as.numeric(NA))) %>%
select(game_length)
game_notes <- page %>%
html_nodes("#games .left") %>%
html_text() %>%
as_tibble() %>%
purrr::set_names("game_notes") %>%
filter(row_number() %% 4 == 0) %>%
filter(game_notes != "Notes")
schedule <- date %>%
bind_cols(away_team) %>%
bind_cols(away_goals) %>%
bind_cols(home_team) %>%
bind_cols(home_goals) %>%
bind_cols(overtime) %>%
bind_cols(attendance) %>%
bind_cols(game_length) %>%
bind_cols(game_notes) %>%
mutate(season = season) %>%
mutate(league = league) %>%
filter(away_goals != "" | home_goals != "") %>%
bind_cols(game_urls) %>%
mutate(season = str_c(season - 1, str_sub(season, 3, 4), sep = "-")) %>%
mutate_all(str_squish) %>%
mutate_all(~na_if(., "")) %>%
mutate_at(vars(away_goals, home_goals, attendance, game_length), as.numeric) %>%
distinct() %>%
select(league, season, date, away_team, away_goals, home_team, home_goals, overtime, attendance, game_length, game_notes, url)
if (progress) {pb$tick()}
return(schedule)
}
schedule_data <- map2_dfr(mydata[["league"]], mydata[["season"]], .get_schedule)
cat("\n")
return(schedule_data)
}
games <- get_schedule(2018:2019)