I'm looking for a method to retry some web-scraping functions in a package I'm writing. Right now, the simplest method I've found is warrenr::persistently(), which works fine, but I'm trying to reduce my package's dependencies.
Any ideas?
If you want to see a reprex for whatever reason, here's a function that sometimes poses issues:
library(tidyverse)
library(progress)
library(rvest)  # read_html(), html_nodes(), html_attr(), html_text() are rvest, not tidyverse
#' Scrape team names and team URLs from eliteprospects.com.
#'
#' Builds every league/season combination, fetches each league standings
#' page, and returns one row per team.
#'
#' @param .league Character vector of league names. Spaces are replaced with
#'   hyphens to match the site's URL scheme (e.g. "liiga", "shl").
#' @param .season Character vector of seasons in "YYYY-YYYY" form.
#' @param .progress Show a progress bar? Defaults to `FALSE`.
#' @param ... Currently unused; accepted for forward compatibility.
#' @return A tibble with columns `team`, `team_url`, `league`, and `season`.
get_teams <- function(.league, .season, .progress = FALSE, ...) {
  # All league/season combinations to scrape. crossing() also handles the
  # de-duplication and sorting the original two-tibble version produced.
  mydata <- tidyr::crossing(
    .league = str_replace_all(.league, " ", "-"),
    .season = .season
  )

  if (.progress) {
    pb <- progress::progress_bar$new(
      format = ":what [:bar] :percent eta: :eta",
      clear = FALSE,
      total = nrow(mydata),
      width = 60
    )
  }

  league_team_data <- map2_dfr(
    mydata[[".league"]], mydata[[".season"]],
    function(.league, .season, ...) {
      # pb and .progress are found in the enclosing frame; pb only exists
      # when .progress is TRUE, so the guard must stay.
      if (.progress) {
        pb$tick(tokens = list(what = "get_teams()"))
      }

      # Random 5-10 s pause between requests to be polite to the server.
      Sys.sleep(sample(seq(5, 10, by = 0.001), 1))

      page <- str_c(
        "https://www.eliteprospects.com/league/", .league, "/", .season
      ) %>%
        rvest::read_html()

      # Query the standings table once; the original ran the identical
      # selector twice (once for hrefs, once for text).
      team_links <- rvest::html_nodes(page, "#standings .team a")

      team_url <- tibble(
        team_url = str_c(rvest::html_attr(team_links, "href"), "?tab=stats")
      )
      team <- tibble(
        team = str_trim(rvest::html_text(team_links), side = "both")
      )

      # NOTE(review): "small" may match more than one node; mutate() below
      # then recycles — confirm this selector is unique on standings pages.
      league <- page %>%
        rvest::html_nodes("small") %>%
        rvest::html_text() %>%
        str_trim(side = "both")

      # "2018-2019" -> "2018-19"; split once instead of twice.
      season_parts <- str_split(.season, "-", simplify = TRUE, n = 2)
      season <- str_c(
        season_parts[, 1], str_sub(season_parts[, 2], 3, 4),
        sep = "-"
      )

      team %>%
        bind_cols(team_url) %>%
        mutate(league = league, season = season)
    }
  )

  league_team_data
}