Help with rvest and forum scraping

I came across the code provided by @gueyenono in the link below and modified it to scrape forum posts from "http://bitcointalk.org/index.php?board=8.0", but I got an error.

(As I am a new member, I can't put the web page URL twice in the post, so I wrote it as "web page" in the code below.)
Can anyone help me with this? I want to extract the post title, the post text with all replies, and the dates.

Many thanks,

library(rvest)
library(dplyr)
library(purrr)
library(stringr)
library(tidyr)

# ===============================================================
#
# HELPER FUNCTIONS
#
# ===============================================================

# Get thread info (thread title and url)

scrape_thread_info <- function(page_url){

  # Select every ".windowbg3" node of the page once, then read both the text
  # and the "href" attribute from that same node set.
  thread_nodes <- html_nodes(read_html(page_url), css = ".windowbg3")

  thread_titles <- html_text(thread_nodes)
  thread_links <- html_attr(thread_nodes, name = "href")

  tibble(title = thread_titles, url = thread_links)

}

# Check if thread has a single page

has_single_page <- function(thread_url){

  # Look up the first ".navpages" node; the result is TRUE when its text is NA.
  pager_node <- html_node(read_html(thread_url), css = ".navpages")

  is.na(html_text(pager_node))
}

# Scrape posts

scrape_posts <- function(thread_link){

  # Text of every ".post" node in the thread page, with whitespace squished.
  post_nodes <- html_nodes(read_html(thread_link), css = ".post")

  str_squish(html_text(post_nodes))

}

# ================================================================
#
# MAIN FUNCTION FOR SCRAPING
#
# ================================================================

# Scrape thread info and posts for the requested forum pages.
#
# pages: positive integer indices of the forum pages to scrape (1 = first page).
# Returns the tibble built by scrape_thread_info() for those pages, with an
# added list-column `posts` holding the result of scrape_posts() for each
# thread url.
#
# NOTE: "web page" is the placeholder the poster used for the forum address;
# it must be replaced with the real URL (and URL scheme for later pages)
# before this can run.
scrape_gs_pages <- function(pages){

  # Find number of pages in the forum and get the link for each page

  page_1_url <- "web page"
  page_1_html <- read_html(page_1_url)

  # Collect every number shown in the pager. str_extract() would keep only the
  # first number of each node, so str_extract_all() is needed to find the max.
  pager_numbers <- page_1_html %>%
    html_nodes(css = "#toppages") %>%
    html_text() %>%
    str_extract_all(pattern = "\\d+") %>%
    unlist() %>%
    as.numeric()

  # No pager (or no number in it) means the forum has a single page.
  n_pages <- if (length(pager_numbers) > 0) max(pager_numbers) else 1

  # 2:n_pages would count down to c(2, 1) for a single-page forum, so guard it.
  later_page_urls <- if (n_pages >= 2) {
    paste0("web page", 2:n_pages, ".html")
  } else {
    character(0)
  }
  all_page_urls <- c(page_1_url, later_page_urls)

  # Fail early instead of passing NA urls to read_html().
  if (!all(pages %in% seq_along(all_page_urls))) {
    stop("`pages` must be between 1 and ", length(all_page_urls), ".", call. = FALSE)
  }
  page_urls <- all_page_urls[pages]

  # Get threads info for the pages of interest

  master <- map_dfr(page_urls, scrape_thread_info)

  master %>%
    mutate(
      posts = url %>% map(scrape_posts)
    )

}

# Scrape the first two forum pages, expand the `posts` list-column to one row
# per post, and strip double quotes from the post text.
# `cols` is given explicitly: tidyr >= 1.0 warns when unnest() is called
# without it.
forum_data <- scrape_gs_pages(pages = c(1, 2)) %>%
  unnest(cols = posts) %>%
  mutate(posts = str_replace_all(string = posts, pattern = '\"', replacement = ""))

forum_data

The code below scrapes the information you are looking for. I did not have any success scraping many threads at the same time. The website kept locking me out! So you will realize that in my code (at the very bottom), I only scrape the first thread just to show you how it works. You may want to edit the code if needed.

The code is divided in four parts:

(i) I verify if it is legal to scrape from the website. Yes it is!
(ii) I load the necessary packages
(iii) I create the scraping functions
(iv) I show you an example

# Is it legal to scrape data from this website? ---------------------------

# NOTE(review): paths_allowed() reports whether the site's robots.txt permits
# bots to fetch this path; presumably that is what "legal" means here --
# confirm against the site's terms of use.
robotstxt::paths_allowed("https://bitcointalk.org/index.php?topic=454795.280") # YES!


# Load packages -----------------------------------------------------------

library(here)
library(rvest)
library(dplyr)
library(stringr)
library(glue)
library(purrr)
library(furrr)
library(lubridate)



# Custom scraping functions -----------------------------------------------

# Build the URLs of the first `n_pages` pages of the board.
#
# n_pages: a single whole number >= 1 (default 1).
# Returns a character vector of length `n_pages`; the first element is the
# board's main page.
# Stops with an error when `n_pages` is not a valid count, or when it exceeds
# the number of pages the forum currently reports.
generate_page_urls <- function(n_pages = 1){
  
  main_url_page <- "https://bitcointalk.org/index.php?board=8.0"
  known_n_pages <- 367 # 367 is the number of pages of the forum when this function was written
  
  if (!is.numeric(n_pages) || length(n_pages) != 1 || is.na(n_pages) ||
      n_pages < 1 || n_pages != round(n_pages)) {
    stop("`n_pages` must be a single whole number >= 1.", call. = FALSE)
  }
  
  # Only hit the network when the request goes beyond the page count known at
  # writing time; smaller requests are answered without any download.
  if (n_pages > known_n_pages) {
    
    page_labels <- html_text(html_nodes(read_html(x = main_url_page), css = ".navPages"))
    
    # Keep purely numeric pager labels so as.numeric() does not warn about NAs.
    page_numbers <- as.numeric(page_labels[grepl("^\\s*[0-9]+\\s*$", page_labels)])
    max_n_pages <- if (length(page_numbers) > 0) max(page_numbers) else 1
    
    if (n_pages > max_n_pages) {
      stop("There are only ", max_n_pages, " pages in the forum.", call. = FALSE)
    }
    
  }
  
  # Page k (k >= 2) is addressed by the offset (k - 1) * 40 after "board=8.".
  # sprintf("%d") is used so large offsets are never printed in scientific
  # notation; for n_pages == 1 the offsets are empty and no extra URL is built.
  extra_pages <- seq_len(n_pages)[-1]
  extra_urls <- sprintf(
    "https://bitcointalk.org/index.php?board=8.%d",
    (extra_pages - 1L) * 40L
  )
  
  c(main_url_page, extra_urls)
  
}

# Read one board page and return a tibble with one row per listed subject:
# its title (`subject`), the text of the ".windowbg2 > a" links
# (`subject_author`) and the subject's link (`link`).
scrape_meta_info <- function(url){
  
  board_page <- read_html(url)
  
  # The same anchors carry both the subject title and its href.
  subject_anchors <- html_nodes(board_page, css = ".leftimg+ td span a")
  subject <- html_text(subject_anchors)
  link <- html_attr(subject_anchors, name = "href")
  
  # Author anchors; entries with empty text are dropped.
  author_text <- html_text(html_nodes(board_page, css = ".windowbg2 > a"))
  subject_author <- discard(author_text, function(txt) txt == "")
  
  tibble(subject, subject_author, link)
  
}


# Scrape all pages of one thread into a tibble with columns `date`,
# `post_author` and `post` (one row per post).
#
# thread_url: URL of the first page of the thread, e.g. a `link` value
#   returned by scrape_meta_info().
# delay: seconds to pause before each request for an additional page
#   (default 0, i.e. no pause). Useful when the site throttles fast scrapers.
scrape_subject_thread <- function(thread_url, delay = 0){
  
  first_page <- read_html(thread_url)
  
  # How many pages are there in the thread?
  # Only purely numeric pager labels are converted, so as.numeric() no longer
  # warns about NAs, and a missing pager simply means a single page.
  page_numbers <- first_page %>% 
    html_nodes(css = ".navPages") %>%
    html_text() %>%
    str_subset(pattern = "^\\s*\\d+\\s*$") %>%
    as.numeric()
  n_pages <- if (length(page_numbers) > 0) max(page_numbers) else 1
  
  if (n_pages == 1) {
    
    htmls <- list(first_page)
    
  } else {
    
    # Page k (k >= 2) is addressed by the offset (k - 1) * 20 after the last ".".
    extra_pages <- 2:n_pages
    url_root <- str_extract(thread_url, "^.*\\.")
    extra_pages_urls <- as.character(glue("{url_root}{(extra_pages-1)*2}0"))
    
    # Plain map() rather than future_map(): parsed documents are external
    # pointers and cannot be moved between the R processes of a parallel plan.
    extra_htmls <- map(extra_pages_urls, function(page_url){
      Sys.sleep(delay)
      read_html(page_url)
    })
    
    # Reuse the first page that was already downloaded instead of fetching it again.
    htmls <- c(list(first_page), extra_htmls)
    
  }
  
  # Text of all nodes matching `css` across every page, in page order.
  # NOTE(review): entries starting with "1587" are dropped as in the original;
  # presumably these are injected non-post elements -- TODO confirm.
  extract_text <- function(css){
    htmls %>%
      map(function(page){
        page %>%
          html_nodes(css = css) %>%
          html_text() %>% 
          str_subset(pattern = "^1587", negate = TRUE)
      }) %>% 
      flatten_chr()
  }
  
  post_author <- extract_text(".poster_info b a")
  
  post <- extract_text(".post")
  
  # The selector also matches ".edited" nodes; anything containing "edit" is
  # removed before the remaining strings are parsed as timestamps.
  date <- extract_text(".subject+ .smalltext , .edited") %>%
    str_subset(pattern = "edit", negate = TRUE) %>%
    as_datetime(format = "%B %d, %Y, %I:%M:%S %p")
  
  
  tibble(date, post_author, post)
  
}



# Scrape the data ---------------------------------------------------------

# Step 1: build the board page URLs; `n_pages` is how many pages to cover (two here).
urls <- generate_page_urls(n_pages = 2)

# Step 2: collect subject, author and link for every thread listed on those pages.
meta_info <- urls %>% future_map_dfr(.f = scrape_meta_info)

# Step 3: scrape the posts of the first listed subject only.
meta_info %>%
  pull(link) %>%
  first() %>%
  scrape_subject_thread()

# A tibble: 465 x 3
   date                post_author     post                                                                                                          
   <dttm>              <chr>           <chr>                                                                                                         
 1 2014-02-08 03:10:25 Maged           "What is KYC ?Know your customer (KYC) refers to due diligence activities that financial institutions and oth~
 2 2014-02-08 03:16:27 Maged           "Want to improve this sticky? Edits to the sticky may be paid! Please post a quote of this sticky with all of~
 3 2014-02-22 02:21:44 repentance      "To expand a bit on what all this means at local level.Services design their own AML/KYC policies and risk ma~
 4 2014-02-24 02:39:03 AsiaNexgen      "Can ANX get added to the list?  We are a Money Services Operator (MSO) in Hong Kong.  This is the equivalent~
 5 2014-02-24 03:53:54 Himself         "Add CoinMKT and VoS, please."                                                                                
 6 2014-03-04 12:07:53 mateo           "There's also https://bitcoin-central.net/ operated by Paymium"                                               
 7 2014-04-08 08:14:22 Sandia          "Quote from: Maged on February 08, 2014, 03:10:25 AM- Determination of the customer's risk in terms of propen~
 8 2014-05-05 13:59:15 Flashman        "At what point does supposed KYC/AML compliance requirement blocking access to amounts worth below the report~
 9 2014-05-28 17:59:03 btcsup          "Important aspects. But i think MTGOX should be removed from this nice article. When i see it's name i become~
10 2014-06-17 00:38:41 moriartybitcoin "AML/KYC is a barrier to entry for small Bitcoin startups and effectively hinders Bitcoin innovation .. it sh~
# ... with 455 more rows
Warning message:
In function_list[[i]](value) : NAs introduced by coercion

# If you want to scrape several threads (the first 3 subjects, for example), run: future_map_dfr(meta_info$link[1:3], scrape_subject_thread)

Thanks a lot. I am trying to run it. Just one question: with css = ".post", I am also scraping quotes, which I do not want to scrape. I tried css = "div.post:not(.quote:nth-child(2))" but it doesn't work either. How can I exclude quotes in each thread?

I'm afraid that it's not as simple as that and I honestly do not know how to leave quotes out.

@gueyenono
Hi again, I have found this page :


xml_remove could be a solution. I tried to integrate it into the scrape_subject_thread function, but couldn't get it to work properly.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.