I'm trying to scrape a webpage using splashr, and I think that I need to reset my splashr container every time I run the function to scrape each link.
My issue is that, to do this, I run start_splash() and stop_splash() within the function, so that I should have a new container every time I run the function. However, this always fails, as I think my container isn't getting reset for some reason. I know this because I'll run splash_active() after I run the function, and it'll return TRUE, meaning that I still have an active container.
Here's the function in question:
get_box_score <- function(my_url) {
progress_bar$tick()$print()
sp <- start_splash()
Sys.sleep(sample(seq(0, 0.1, by = 0.001), 1))
render_html(url = my_url) %>%
html_node("#boxgoals") %>%
html_table() %>%
as_tibble()
stop_splash(sp)
}
Anyone know how to go about this? I attached some reproducible code below. Thanks!
library(tidyverse)
library(rvest)
library(splashr)
url <- "https://www.uscho.com/scoreboard/michigan/mens-hockey/"
get_data <- function(myurl) {
link_data <- myurl %>%
read_html() %>%
html_nodes("td:nth-child(13) a") %>%
html_attr("href") %>%
str_c("https://www.uscho.com", .) %>%
as_tibble() %>%
set_names("url")
game_type <- myurl %>%
read_html() %>%
html_nodes("td:nth-child(12)") %>%
html_text() %>%
as_tibble() %>%
filter(between(row_number(), 2, n())) %>%
set_names("game_type")
as_tibble(data.frame(link_data, game_type))
}
link_list <- get_data(url)
urls <- link_list %>%
filter(game_type != "EX") %>%
pull(url)
get_box_score <- function(my_url) {
progress_bar$tick()$print()
sp <- start_splash()
Sys.sleep(sample(seq(0, 0.1, by = 0.001), 1))
render_html(url = my_url) %>%
html_node("#boxgoals") %>%
html_table() %>%
as_tibble()
stop_splash(sp)
}
persistently_get_box_score <- warrenr::persistently(get_box_score, max_attempts = 15, wait_seconds = 0.001)
try_get_box_score <- function(url) {
tryCatch(persistently_get_box_score(url), error = function(e) {data.frame()})
}
progress_bar <- link_list %>%
filter(game_type != "EX") %>%
tally() %>%
progress_estimated(min_time = 0)
mydata <- pmap_df(list(urls), try_get_box_score)