Hi all,
I am definitely still a novice so I'm sorry for what is probably a dumb question! I'm trying to scrape text from a forum (Effects, Pedals, Strings & Things | The Gear Page) to run some topic modeling. I put together some code based on examples from other threads, but it's not working; no data is being output. See below... Any ideas for what I'm doing wrong? I've never done anything web-based before, just rudimentary stats and text mining on pre-formatted datasets. Thanks in advance!!!
# HELPER FUNCTIONS
# Get thread info (thread title and url)
scrape_thread_info <- function(page_url){
html <- page_url %>%
read_html() %>%
html_nodes(css = ".structItem-title+ a")
tibble(
title = html %>% html_text(),
url = html %>% html_attr(name = "href")
)
}
# Check if thread has a single page
has_single_page <- function(thread_url){
thread_url %>%
read_html() %>%
html_node(css = ".block-outer-main") %>%
html_text() %>%
is.na()
}
# Scrape posts
scrape_posts <- function(thread_link){
thread_link %>%
read_html() %>%
html_nodes(css = ".bbWrapper") %>%
html_text() %>%
str_squish()
}
##################
# MAIN FUNCTION FOR SCRAPING
scrape_tgp_pages <- function(pages){
# Find number of pages in the forum and get the link for each page
page_1_url <- "https://www.thegearpage.net/board/index.php?forums/effects-pedals-strings-things.4/"
page_1_html <- read_html(page_1_url)
n_pages <- page_1_html %>%
html_nodes(css = ".pageNav-main .pageNav-page") %>%
html_text() %>%
str_extract(pattern = "\\d+") %>%
as.numeric() %>%
max(na.rm = TRUE)
page_urls <- c(
page_1_url,
paste0("https://www.thegearpage.net/board/index.php?forums/effects-pedals-strings-things.4/page-", 2:n_pages, ".html")
)[pages]
# Get threads info for the pages of interest
master <- map_dfr(page_urls, scrape_thread_info)
master %>%
mutate(
posts = url %>% map(scrape_posts)
)
}
# EXAMPLE TO SCRAPE PAGES 1-2
forum_data <- scrape_tgp_pages(pages = c(1, 2))
forum_data