Hi All,
I'm trying to scrape the forum site listed in the code below.
I've adopted gueyenono's code from another post on forum scraping with rvest, but I get a couple of errors:
- "Error in 2:n_pages : result would be too long a vector"
- if I put 2:3 for the pages, just to test, I get:
"In max(., na.rm = TRUE) : no non-missing arguments to max; returning -Inf"
Can anyone help me with this please?
Here is the code I'm using:
'''
library(rvest)
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)
# HELPER FUNCTIONS
# Get thread info (thread title and url)
# Collect the thread links found on one forum listing page.
#
# Args:
#   page_url: address of the listing page, passed to read_html().
#
# Returns:
#   A tibble with one row per node matched by the ".icons+ a" selector:
#   `title` is the node text and `url` is its `href` attribute.
scrape_thread_info <- function(page_url) {
  listing_page <- read_html(page_url)
  thread_links <- html_nodes(listing_page, css = ".icons+ a")

  tibble(
    title = html_text(thread_links),
    url = html_attr(thread_links, name = "href")
  )
}
# Check if thread has a single page
# Report whether the text of the first ".brace" match on a thread page is NA.
#
# Args:
#   thread_url: address of the thread page, passed to read_html().
#
# Returns:
#   TRUE when html_text() yields NA for the ".brace" node, FALSE otherwise.
#   NOTE(review): presumably NA means the thread has no pagination element,
#   i.e. a single page -- confirm against the site's markup.
has_single_page <- function(thread_url) {
  thread_page <- read_html(thread_url)
  brace_node <- html_node(thread_page, css = ".brace")
  is.na(html_text(brace_node))
}
# Scrape posts
# Extract the text of every ".message_data" node on a thread page.
#
# Args:
#   thread_link: address of the thread page, passed to read_html().
#
# Returns:
#   A character vector with one element per matched node, with whitespace
#   normalised by str_squish().
scrape_posts <- function(thread_link) {
  post_nodes <- html_nodes(read_html(thread_link), css = ".message_data")
  str_squish(html_text(post_nodes))
}
# MAIN FUNCTION FOR SCRAPING
# Scrape the requested listing pages of the forum and the posts of every
# thread linked from them.
#
# Args:
#   pages: numeric vector of listing-page indices to scrape (1 = first page).
#
# Returns:
#   A tibble with one row per thread: `title`, `url`, and `posts`, a
#   list-column holding the character vector returned by scrape_posts().
scrape_gs_pages <- function(pages) {
  stopifnot(is.numeric(pages), !anyNA(pages))

  # Find number of pages in the forum and get the link for each page
  page_1_url <- "https://www.polarisatvforums.com/forums/atv-general-discussion.7/"
  page_1_html <- read_html(page_1_url)

  # NOTE(review): the ".feedItem-thread-information" selector and the
  # "<n>.html" URL scheme below are carried over unchanged; confirm that they
  # match this forum's markup. When the selector matches nothing, the
  # fallback below is used instead of failing.
  page_counts <- page_1_html %>%
    html_nodes(css = ".feedItem-thread-information") %>%
    html_text() %>%
    str_extract(pattern = "\\d+") %>% # backslash must be doubled in R strings
    as.numeric()
  page_counts <- page_counts[!is.na(page_counts)]

  if (length(page_counts) > 0) {
    n_pages <- max(page_counts)
  } else {
    # max() of an empty vector is -Inf, which made `2:n_pages` fail with
    # "result would be too long a vector".
    warning(
      "Could not determine the number of forum pages; ",
      "assuming the highest requested page exists.",
      call. = FALSE
    )
    n_pages <- max(1, pages)
  }

  # `2:n_pages` would count backwards for a single-page forum, so only build
  # the later pages when there are any.
  page_urls <- page_1_url
  if (n_pages >= 2) {
    page_urls <- c(page_urls, paste0(page_1_url, 2:n_pages, ".html"))
  }
  page_urls <- page_urls[pages]

  # Indexing past the last page yields NA, which read_html() cannot fetch.
  out_of_range <- is.na(page_urls)
  if (any(out_of_range)) {
    warning(
      "Skipping requested pages beyond the ", n_pages, " available.",
      call. = FALSE
    )
    page_urls <- page_urls[!out_of_range]
  }

  # Get threads info for the pages of interest
  master <- map_dfr(page_urls, scrape_thread_info)

  master %>%
    mutate(
      posts = url %>% map(scrape_posts)
    )
}
# Scrape the first two listing pages, expand the `posts` list-column to one
# row per post, and strip double-quote characters from the post text.
forum_data <- scrape_gs_pages(pages = c(1, 2))
forum_data <- unnest(forum_data, cols = "posts")
forum_data <- mutate(
  forum_data,
  posts = str_replace_all(string = posts, pattern = '\"', replacement = "")
)
forum_data
'''