Any idea why splashr doesn't scrape all the data I want?

splashr
web-scraping

#1

I'm trying to scrape the data within #boxgoals for each scheduled game linked from my URL. However, splashr seems to stop scraping after 155 observations in mydata, which I believe corresponds to 25 games. Any idea whether this is a splashr issue or something else? Thanks!

library(tidyverse)
library(splashr)
library(rvest)

# Start a Splash instance via splashr; `sp` is the handle later handed to
# stop_splash() once the scrape has finished.
sp <- start_splash()

# Scoreboard page from which the individual game links are collected.
url <- "https://www.uscho.com/scoreboard/michigan/mens-hockey/"  

#' Collect box-score links and game types from a scoreboard page
#'
#' Downloads and parses the page once, then reads the `href` of every link
#' found in a 13th table cell and the text of every 12th table cell.
#'
#' @param myurl URL of the scoreboard page.
#' @return A tibble with two character columns: `url` (each href prefixed
#'   with "https://www.uscho.com") and `game_type` (text of the 12th cell).
get_data <- function(myurl) {
  # Parse once and reuse: the original fetched and parsed the page twice.
  page <- read_html(myurl)

  hrefs <- page %>%
    html_nodes("td:nth-child(13) a") %>%
    html_attr("href")

  game_type <- page %>%
    html_nodes("td:nth-child(12)") %>%
    html_text()
  # Drop the first matched cell, exactly as the original row filter did
  # (presumably a header cell -- confirm against the page markup).
  game_type <- game_type[-1]

  # Fail with a clear message instead of letting mismatched columns be
  # recycled against each other when the page layout changes.
  if (length(hrefs) != length(game_type)) {
    stop(
      "Found ", length(hrefs), " game links but ", length(game_type),
      " game types on ", myurl,
      call. = FALSE
    )
  }

  tibble(
    url = str_c("https://www.uscho.com", hrefs),
    game_type = game_type
  )
}

# Scrape the scoreboard into a table of box-score links and game types.
link_list <- get_data(url)

# Keep only the links of games whose type is not "EX".
urls <- pull(filter(link_list, game_type != "EX"), url)

#' Render one game page through Splash and extract its goals table
#'
#' Advances the global `progress_bar`, pauses for a random interval of at
#' most 0.1 seconds, then renders `my_url` and reads its `#boxgoals` node.
#'
#' @param my_url URL of a single game's box-score page.
#' @return A tibble built from the `#boxgoals` table.
get_box_score <- function(my_url) {
  progress_bar$tick()$print()

  # Random delay drawn from 0, 0.001, ..., 0.1 seconds before the request.
  pause_choices <- seq(0, 0.1, by = 0.001)
  Sys.sleep(sample(pause_choices, 1))

  rendered_page <- render_html(url = my_url)
  goals_node <- html_node(rendered_page, "#boxgoals")
  as_tibble(html_table(goals_node))
}

# Wrap get_box_score with warrenr::persistently(). Going by the argument
# names, it presumably retries a failing call up to 15 times with a 0.001
# second wait between attempts -- confirm against the warrenr documentation.
persistently_get_box_score <- warrenr::persistently(get_box_score, max_attempts = 15, wait_seconds = 0.001)

#' Fetch one box score, reporting failures instead of hiding them
#'
#' @param url URL of a single game's box-score page.
#' @return The result of `persistently_get_box_score(url)`, or an empty
#'   `data.frame()` (after a warning) when that call signals an error.
try_get_box_score <- function(url) {
  tryCatch(
    persistently_get_box_score(url),
    error = function(e) {
      # The original discarded the error, so games that failed simply went
      # missing from the combined result with no hint as to why.
      warning(
        "Box score could not be scraped for ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      data.frame()
    }
  )
}

# Progress bar sized from the number of non-"EX" games; get_box_score()
# ticks it each time it is called. tally() returns the count as a one-row
# tibble, and that tibble is what is passed on as `n` here.
progress_bar <- link_list %>%
  filter(game_type != "EX") %>%
  tally() %>%
  progress_estimated(min_time = 0)


# Scrape every URL and row-bind the per-game results; a game whose scrape
# signals an error contributes an empty data frame via try_get_box_score().
mydata <- pmap_df(list(urls), try_get_box_score)

# Shut down the Splash instance that was started before the scrape.
stop_splash(sp)

Difficulties with updating/adding/removing objects from within a function
#2

Okay, here is a possible solution. Instead of calling start_splash() once before the whole scrape and stop_splash() once after everything, I'm trying to call start_splash() and stop_splash() within each individual scrape, like this:

library(rvest)

# Scoreboard page from which the individual game links are collected.
url <- "https://www.uscho.com/scoreboard/michigan/mens-hockey/"  

#' Collect box-score links and game types from a scoreboard page
#'
#' Downloads and parses the page once, then reads the `href` of every link
#' found in a 13th table cell and the text of every 12th table cell.
#'
#' @param myurl URL of the scoreboard page.
#' @return A tibble with two character columns: `url` (each href prefixed
#'   with "https://www.uscho.com") and `game_type` (text of the 12th cell).
get_data <- function(myurl) {
  # Parse once and reuse: the original fetched and parsed the page twice.
  page <- read_html(myurl)

  hrefs <- page %>%
    html_nodes("td:nth-child(13) a") %>%
    html_attr("href")

  game_type <- page %>%
    html_nodes("td:nth-child(12)") %>%
    html_text()
  # Drop the first matched cell, exactly as the original row filter did
  # (presumably a header cell -- confirm against the page markup).
  game_type <- game_type[-1]

  # Fail with a clear message instead of letting mismatched columns be
  # recycled against each other when the page layout changes.
  if (length(hrefs) != length(game_type)) {
    stop(
      "Found ", length(hrefs), " game links but ", length(game_type),
      " game types on ", myurl,
      call. = FALSE
    )
  }

  tibble(
    url = str_c("https://www.uscho.com", hrefs),
    game_type = game_type
  )
}

# Scrape the scoreboard into a table of box-score links and game types.
link_list <- get_data(url)

# Keep only the links of games whose type is not "EX".
urls <- pull(filter(link_list, game_type != "EX"), url)

#' Render one game page in its own Splash instance and extract its goals table
#'
#' Advances the global `progress_bar`, starts a Splash instance, pauses for a
#' random interval of at most 0.1 seconds, renders `my_url`, and reads its
#' `#boxgoals` node. The Splash instance is stopped when the function exits.
#'
#' @param my_url URL of a single game's box-score page.
#' @return A tibble built from the `#boxgoals` table.
get_box_score <- function(my_url) {
  progress_bar$tick()$print()

  sp <- start_splash()
  # Registered right away so the instance is stopped even when rendering or
  # parsing fails; the original only stopped it on the success path, leaving
  # one instance running for every failed attempt.
  on.exit(stop_splash(sp), add = TRUE)

  # NOTE(review): this pause of at most 0.1 s is the only wait between
  # starting Splash and the first request -- confirm the instance is ready.
  Sys.sleep(sample(seq(0, 0.1, by = 0.001), 1))

  # Must stay the last expression. The original ended with stop_splash(sp),
  # so the scraped table was discarded and the function returned the value
  # of stop_splash() instead.
  render_html(url = my_url) %>%
    html_node("#boxgoals") %>%
    html_table() %>%
    as_tibble()
}

# Wrap get_box_score with warrenr::persistently(). Going by the argument
# names, it presumably retries a failing call up to 15 times with a 0.001
# second wait between attempts -- confirm against the warrenr documentation.
persistently_get_box_score <- warrenr::persistently(get_box_score, max_attempts = 15, wait_seconds = 0.001)

#' Fetch one box score, reporting failures instead of hiding them
#'
#' @param url URL of a single game's box-score page.
#' @return The result of `persistently_get_box_score(url)`, or an empty
#'   `data.frame()` (after a warning) when that call signals an error.
try_get_box_score <- function(url) {
  tryCatch(
    persistently_get_box_score(url),
    error = function(e) {
      # The original discarded the error, so games that failed simply went
      # missing from the combined result with no hint as to why.
      warning(
        "Box score could not be scraped for ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      data.frame()
    }
  )
}

# Progress bar sized from the number of non-"EX" games; get_box_score()
# ticks it each time it is called. tally() returns the count as a one-row
# tibble, and that tibble is what is passed on as `n` here.
progress_bar <- link_list %>%
  filter(game_type != "EX") %>%
  tally() %>%
  progress_estimated(min_time = 0)


# Scrape every URL and row-bind the per-game results; a game whose scrape
# signals an error contributes an empty data frame via try_get_box_score().
mydata <- pmap_df(list(urls), try_get_box_score)

I'm in the process of seeing if this works. Hopefully it will.


#3

OK, it doesn't work. Does anyone know why?