Hey
I was trying to scrape IMDB site. The rating information is not consistently present at all pages. Any suggestions to handle these type of missing values while web scraping.
My code as follows.
get_cast = function(movie_links)
{
movie_page = read_html(movie_links)
movie_cast = movie_page %>% html_nodes(".primary_photo+ td a") %>% html_text() %>% paste(collapse=(","))
return(movie_cast)
}
movies = data.frame()
for(page_result in seq(from=1,to=101,by=50))
{
next_page_link = paste0("https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&start=",page_result,"&ref_=adv_nxt")
next_page_infor = read_html(next_page_link)
name = next_page_infor %>% html_nodes(".lister-item-header a") %>% html_text()
year = next_page_infor %>% html_nodes(".text-muted.unbold") %>% html_text() %>% str_trim()
synopsis = next_page_infor %>% html_nodes(".ratings-bar+ .text-muted") %>% html_text() %>% str_trim() %>% as.character()
rate = next_page_infor %>% html_nodes(".ratings-imdb-rating strong") %>% html_text() %>% as.numeric()
# Extract Page url section and paste with domain name for entire list.
movie_links = next_page_infor %>%
html_nodes(".lister-item-header a") %>%
html_attr("href") %>%
paste0("https://www.imdb.com",., sep="")
cast = sapply(movie_links,FUN = get_cast,USE.NAMES = FALSE)
scraped_info = rbind(movies,data.frame(name,year,synopsis,rate,cast,stringsAsFactors = FALSE))
print(paste0("Page No:",page_result))
}
Appreciate your time.
Regards
Karthik