Any idea why I keep getting an HTTP error 403 with RSelenium?

Hey everybody, I keep getting an HTTP error 403 when using my scraper, and I'm confused about two things:

a. why I get the error in the first place
b. why it shows up on some runs but not on others

Any ideas? Thanks!

library(tidyverse)
library(rvest)
library(RSelenium)
library(progress)


#' Scrape schedule links from the KHL calendar pages
#'
#' Builds every combination of `league` and `season`, opens the matching
#' calendar page on en.khl.ru in a Selenium-driven browser, and collects the
#' links selected by the CSS selector `"ul+ ul li:nth-child(1) a"`.
#'
#' @param league Character vector of league names. Only `"KHL"` is supported.
#' @param season Character vector of seasons in `"YYYY-YY"` form, from
#'   `"2008-09"` through `"2018-19"`.
#' @param ... Currently ignored.
#' @param progress Show a progress bar that advances once per league/season
#'   combination?
#' @return A tibble with one row per distinct link and the columns `url`,
#'   `season` and `league`.
get_schedule <- function(league, season, ..., progress = TRUE) {

  mydata <- tidyr::crossing(league, season)

  if (progress) {
    pb <- progress::progress_bar$new(
      format = "get_schedule() [:bar] :percent eta: :eta",
      clear = FALSE,
      total = nrow(mydata),
      show_after = 0
    )
    # Draw the empty bar right away, before the first (slow) page load.
    pb$tick(0)
  }

  # Calendar id used in the en.khl.ru URL for each supported season.
  khl_calendar_ids <- c(
    "2018-19" = "671",
    "2017-18" = "468",
    "2016-17" = "405",
    "2015-16" = "309",
    "2014-15" = "266",
    "2013-14" = "244",
    "2012-13" = "222",
    "2011-12" = "202",
    "2010-11" = "185",
    "2009-10" = "167",
    "2008-09" = "160"
  )

  # Scrapes a single league/season combination.
  .get_schedule <- function(league, season, ...) {

    # Validate before launching a browser so bad input fails fast.
    if (league != "KHL") {
      stop("League not available. Sorry!")
    }

    season_key <- as.character(season)

    if (!season_key %in% names(khl_calendar_ids)) {
      stop("Season not available. Sorry!")
    }

    url <- paste0(
      "https://en.khl.ru/calendar/", khl_calendar_ids[[season_key]], "/00/"
    )

    # NOTE(review): a new Selenium server and browser are started for every
    # season. With its default arguments rsDriver() presumably contacts remote
    # servers to check for driver binaries on each start, which may be where
    # the intermittent "HTTP error 403" comes from -- confirm against the
    # RSelenium documentation (see the `check` argument).
    driver <- rsDriver(verbose = FALSE)

    # Shut the browser and server down even when scraping fails; otherwise the
    # server keeps running and blocks the port for the next call.
    on.exit(
      {
        try(driver$client$close(), silent = TRUE)
        try(driver$server$stop(), silent = TRUE)
      },
      add = TRUE
    )

    driver$client$navigate(url)

    page <- driver$client$getPageSource() %>%
      purrr::pluck(1) %>%
      read_html()

    links <- page %>%
      html_nodes("ul+ ul li:nth-child(1) a") %>%
      html_attr("href")

    schedule <- tibble(
      url = str_c("https://en.khl.ru", links),
      season = season,
      league = league
    ) %>%
      distinct()

    # Advance the bar once this combination has been scraped.
    if (progress) {
      pb$tick()
    }

    schedule
  }

  map2_dfr(mydata[["league"]], mydata[["season"]], .get_schedule)

}

# Reprex call: request the schedule links for the 2018-19 KHL season.
get_schedule(league = "KHL", season = "2018-19")
#> Error in open.connection(con, "rb"): HTTP error 403.

Created on 2018-11-25 by the reprex package (v0.2.0).

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.