Hey @anacho,
I was able to get the user info; however, it is important to realize that not all users have that info on their profile. Also, scraping the entire forum will take A LONG TIME. I tried running the algorithm for about 3 hours and it was still not done! I know I could have found ways to optimize the code, but it would still take a pretty long time. So, the code below only scrapes the first two threads. You may want to make the relevant changes to scrape everything. Finally, if the date is not available for a post (i.e., it is NA), this means that the corresponding post is a reply to an actual post. Let me know if you have any questions.
library(dplyr)
library(rvest)
library(purrr)
library(RCurl)
library(stringr)
library(tidyr)
# Estimate the number of listing pages on the forum: take the third number
# found in the forum title (presumably the total thread count -- TODO confirm
# against the live page) and divide it by 20 (assumed threads per page).
page1_html <- getURL("https://www.medhelp.org/forums/Aspergers-Syndrome/show/191?page=1")
n_pages <- page1_html %>%
  read_html() %>%
  html_node("div.forum_title") %>%
  html_text() %>%
  str_extract_all("\\d+") %>%
  flatten_chr() %>%
  as.numeric() %>%
  `[`(3) %>%
  # Round up so the page count is a whole number that includes a final,
  # partially filled page, instead of handing a fractional length to seq_len().
  {ceiling(. / 20)}
# Get all thread titles and thread links
# Build one listing URL per estimated forum page.
page_urls <- seq_len(n_pages) %>%
  paste0("https://www.medhelp.org/forums/Aspergers-Syndrome/show/191?page=", .)
# Only the first listing page is downloaded here; use page_urls instead of
# page_urls[1] if you want to scrape everything!
page_htmls <- page_urls[1] %>%
  map_chr(getURL)
# Return the text of every ".subj_title a" link found in a listing page.
# `html` is handed straight to read_html().
scrape_thread_titles <- function(html) {
  listing <- read_html(html)
  title_links <- html_nodes(listing, ".subj_title a")
  html_text(title_links)
}
# Return the absolute URL of every ".subj_title a" link found in a listing
# page. `html` is handed straight to read_html().
# Returns character(0) when the page contains no such links.
scrape_thread_links <- function(html) {
  hrefs <- read_html(html) %>%
    html_nodes(".subj_title a") %>%
    html_attr("href")
  # paste0() recycles a zero-length vector to "", which would turn an empty
  # page into a single bogus "https://www.medhelp.org" link.
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0("https://www.medhelp.org", hrefs)
}
# Collect the thread titles of each listing page, keeping only pages that
# actually yielded titles.
thread_titles <- page_htmls %>%
  map(scrape_thread_titles) %>%
  keep(~ length(.x) > 0)
# Number of listing pages that contained at least one thread
correct_n_pages <- length(thread_titles)
thread_titles <- flatten_chr(thread_titles)
# Take links from the first correct_n_pages pages only, so they pair up with
# the titles (this lines up only if any empty pages come last).
thread_links <- page_htmls %>%
  map(scrape_thread_links) %>%
  head(correct_n_pages) %>%
  flatten_chr()
scraping_info <- tibble(thread_titles = thread_titles, thread_links = thread_links)
# Scrape all thread posts and poster's IDs
# Download the HTML of every thread collected above.
thread_htmls <- scraping_info$thread_links %>%
  map_chr(getURL)
# Return the text of every ".username a" node in a thread page.
# `html` is handed straight to read_html().
scrape_poster_ids <- function(html) {
  thread_page <- read_html(html)
  name_links <- html_nodes(thread_page, css = ".username a")
  html_text(name_links)
}
# Return the "datetime" attribute of every ".username .mh_timestamp" node in
# a thread page, parsed as a Date using the "%Y-%m-%d" format.
# `html` is handed straight to read_html().
scrape_poster_dates <- function(html) {
  stamp_nodes <- html_nodes(read_html(html), css = ".username .mh_timestamp")
  stamps <- html_attr(stamp_nodes, "datetime")
  as.Date(stamps, format = "%Y-%m-%d")
}
# Return the text of every post body in a thread page (nodes matching
# ".comment_body", ".resp_body" or "#subject_msg"), with carriage returns and
# newlines removed and surrounding whitespace trimmed.
# `html` is handed straight to read_html().
scrape_posts <- function(html) {
  post_nodes <- html_nodes(
    read_html(html),
    ".comment_body , .resp_body , #subject_msg"
  )
  raw_text <- html_text(post_nodes)
  one_line <- str_replace_all(raw_text, "\r|\n", "")
  str_trim(one_line)
}
# Return the absolute URLs taken from the "href" attributes of the children
# of every "div.username" node in a thread page (presumably the posters'
# profile pages -- verify against the live markup). Children without an
# "href" are skipped. `html` is handed straight to read_html().
# Returns character(0) when no such links are found.
scrape_poster_info_links <- function(html) {
  hrefs <- read_html(html) %>%
    html_nodes(css = "div.username") %>%
    html_children() %>%
    html_attr("href") %>%
    discard(is.na)
  # paste0() recycles a zero-length vector to "", which would turn "no links"
  # into a single bogus "https://www.medhelp.org" link.
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0("https://www.medhelp.org", hrefs)
}
# For each element of `links`, read the page and return the text of the first
# node matching ".section:nth-child(1) .title+ span" (NA when nothing
# matches). Presumably this is the "sex, age" summary on a profile page --
# verify against the live markup. Returns a character vector, one per link.
scrape_poster_info <- function(links) {
  read_profile_summary <- function(link) {
    profile_page <- read_html(link)
    summary_node <- html_node(profile_page, ".section:nth-child(1) .title+ span")
    html_text(summary_node)
  }
  map_chr(links, read_profile_summary)
}
# SCRAPE INFORMATION FOR FIRST 2 THREADS ONLY
# head() keeps at most two rows; indexing with [1:2, ] would create an all-NA
# row when fewer than two threads are available.
scraping_info_first_2 <- head(scraping_info, 2)
# Download each selected thread page once; every thread-level scraper below
# parses these same HTML strings.
htmls <- map(scraping_info_first_2$thread_links, getURL)
master_data_first_2 <-
  scraping_info_first_2 %>%
  mutate(
    poster_ids = map(htmls, scrape_poster_ids),
    date = map(htmls, scrape_poster_dates),
    posts = map(htmls, scrape_posts),
    # Reuse the already downloaded thread HTML rather than fetching each
    # thread URL a second time.
    poster_info_page_links = map(htmls, scrape_poster_info_links),
    # NOTE(review): this column is dropped by name in the select() further
    # down and is not otherwise used in this script; it is kept so that
    # select() still finds it.
    poster_info_page_htmls = map(poster_info_page_links, ~ map(.x, getURL)),
    poster_info = map(poster_info_page_links, scrape_poster_info)
  )
# Keep relevant data, unnest to one row per post, and separate the
# poster_info column into 2 columns
final_first_2 <-
  master_data_first_2 %>%
  # Drop the helper columns first so only the per-post columns are unnested.
  select(-thread_links, -poster_info_page_links, -poster_info_page_htmls) %>%
  # Name the list-columns explicitly; calling unnest() without `cols` is
  # deprecated in current tidyr.
  unnest(cols = c(poster_ids, date, posts, poster_info)) %>%
  # Profiles with fewer than two pieces get NA on the right; fill = "right"
  # makes that explicit instead of warning on every such row.
  separate(poster_info, into = c("sex", "age"), sep = ", ", fill = "right") %>%
  mutate(age = as.numeric(age))
final_first_2
# A tibble: 39 x 6
thread_titles poster_ids date posts sex age
<chr> <chr> <date> <chr> <chr> <dbl>
1 Shutdown Mode LearningGF 2009-01-14 My boyfriend has Asperger's Sydrome. If he gets too confused, uncomfortable or hurt. He~ NA NA
2 Shutdown Mode MJIthewriter 2009-01-14 When I shut down it's feeling overwhelmed. imagine if you were thrown out in a hughway~ Fema~ 35
3 Shutdown Mode Sally44 2009-01-14 I have a son who will be 8 in February. When he gets overstimulated, or his expectatio~ Fema~ 54
4 Shutdown Mode MaryannesMom 2009-01-14 "My Aspie husband would go through cycles, every couple of months he would need to be a~ Fema~ NA
5 Shutdown Mode MJIthewriter 2009-01-14 Also headaches seem to trigger shutdowns. I had a bad one yesterday. Though the headach~ Fema~ 35
6 Shutdown Mode SueNYC 2009-01-15 "Though I would say that my husband definitely does not have Asperger's, he definitely ~ Fema~ 52
7 Shutdown Mode teburgan 2012-05-27 hi Sue, I wanted to let you know I u derstand. I should never have married my husband.~ NA NA
8 Shutdown Mode ryans93 2013-11-03 "I have had various shut downs. Our minds simply cannot comprehend or deal with the sit~ Male 37
9 Shutdown Mode nbarslou 2014-01-28 "My boyfriend of 9 months told me an old girlfriend said he had aspergers. My comment w~ NA NA
10 Shutdown Mode Debraydebor~ 2014-04-03 "So happy to read your post. I have been desperate for more information to help me in m~ Fema~ 67
# ... with 29 more rows