Hey @anacho,
I was able to complete your first request, which was to scrape the author IDs in each thread. I had to change a few variable and function names. I also used the RCurl::getURL() function to save the HTML from all links into a variable and then scrape the data of interest from that variable. This is good practice because otherwise the code would repeatedly scrape directly from the website, and some websites will lock you out for doing so.
library(rvest)
library(dplyr)
library(stringr)
library(purrr)
library(tidyr)
library(RCurl)
# Scrape thread titles, thread links, authors and number of views
# from the board index page.
url <- "https://www.healthboards.com/boards/aspergers-syndrome/"
h <- read_html(url)

# Titles: text of each anchor matched inside the thread list.
threads <- h %>%
  html_nodes("#threadslist .alt1 a") %>%
  html_text()

# Links: href of the same anchors, so they line up one-to-one with `threads`.
thread_links <- h %>%
  html_nodes("#threadslist .alt1 a") %>%
  html_attr("href")

# Thread starters: text of the matched nodes with tabs, carriage returns
# and newlines removed.
thread_starters <- h %>%
  html_nodes("#threadslist .alt1 .smallfont") %>%
  html_text() %>%
  str_replace_all("\t|\r|\n", "")

# Views: drop the commas so the text can be converted to numeric.
views <- h %>%
  html_nodes(".alt2:nth-child(6)") %>%
  html_text() %>%
  str_replace_all(",", "") %>%
  as.numeric()
# Custom functions to scrape author IDs and posts
# Extract cleaned text from a thread page.
#
# `link` is passed straight to read_html(). The nodes matching the CSS
# selector ".smallfont~ hr+ div" are read as text (presumably the post
# bodies -- confirm against the page markup).
#
# Returns a character vector, one element per matched node, with tabs,
# carriage returns and newlines removed and surrounding whitespace trimmed.
scrape_posts <- function(link) {
  page <- read_html(link)
  post_nodes <- html_nodes(page, css = ".smallfont~ hr+ div")
  raw_text <- html_text(post_nodes)
  flattened <- str_replace_all(raw_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}
# Extract cleaned text of the "postmenu" <div>s from a thread page.
#
# `link` is passed straight to read_html(). Every <div> is collected, and
# those whose `id` attribute contains "postmenu" are kept (presumably one
# per post, holding the author's name -- confirm against the page markup).
#
# Returns a character vector, one element per kept <div>, with tabs,
# carriage returns and newlines removed and surrounding whitespace trimmed.
scrape_author_ids <- function(link) {
  divs <- html_nodes(read_html(link), "div")
  div_ids <- html_attr(divs, "id")
  # Positions of the <div>s whose id matches; these index back into `divs`.
  keep <- str_which(div_ids, pattern = "postmenu")
  author_text <- html_text(divs[keep])
  flattened <- str_replace_all(author_text, pattern = "\t|\r|\n", replacement = "")
  str_trim(flattened)
}
# Create master dataset
#
# Download every thread page once up front; both scrapers below then parse
# the saved HTML instead of requesting each page from the site again.
htmls <- map(thread_links, getURL)

# Start from one row per thread, attach that thread's author IDs and post
# texts as list-columns, then expand the list-columns into rows.
master_data <-
  tibble(threads, thread_starters, views, thread_links) %>%
  mutate(
    post_author_id = map(htmls, scrape_author_ids),
    post = map(htmls, scrape_posts)
  ) %>%
  select(threads:views, post_author_id, post, thread_links) %>%
  # Name the list-columns explicitly: calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0 and emits a warning on every run.
  unnest(cols = c(post_author_id, post))

head(master_data)
threads thread_starters views thread_links post_author_id post
<chr> <chr> <dbl> <chr> <chr> <chr>
1 ADHD And Aspergers MyNameIsCrazy 5021 https://www.healthboards.com/boards/asperge~ MyNameIsCrazy I have adhd and asperger syndrome and was wondering abou~
2 ADHD And Aspergers MyNameIsCrazy 5021 https://www.healthboards.com/boards/asperge~ Dragonfly Win~ Hi there,My son has both, I have Inattentive ADHD and un~
3 ADHD And Aspergers MyNameIsCrazy 5021 https://www.healthboards.com/boards/asperge~ DuckyBaby03 Hello, I understand what your going through. I also have~
4 Adult Pants Pooping~ poopypants21 1705 https://www.healthboards.com/boards/asperge~ poopypants21 I am a 42 year old male with Asperger's Syndrome and occ~
5 Adult Pants Pooping~ poopypants21 1705 https://www.healthboards.com/boards/asperge~ 7ash7 Hi, to help answer your question, do you conciously and/~
6 Adult Pants Pooping~ poopypants21 1705 https://www.healthboards.com/boards/asperge~ poopypants21 Accidentally. My GF does wear cloth diapers because she ~
As for your second request, I am not sure how you accessed the "About me" page on the website.
Hope this helps.