Web Scraping with Missing Values

Hey,
I was trying to scrape the IMDB site. The rating information is not consistently present on all pages. Any suggestions for handling this type of missing value while web scraping?

My code is as follows.


get_cast = function(movie_links){
  movie_page = read_html(movie_links)
  movie_cast = movie_page %>% html_nodes(".primary_photo+ td a") %>% html_text() %>% paste(collapse = ",")
  return(movie_cast)
}



movies = data.frame()

for(page_result in seq(from = 1, to = 101, by = 50)){
  next_page_link = paste0("https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&start=", page_result, "&ref_=adv_nxt")

  next_page_infor = read_html(next_page_link)

  name = next_page_infor %>% html_nodes(".lister-item-header a") %>% html_text()
  year = next_page_infor %>% html_nodes(".text-muted.unbold") %>% html_text() %>% str_trim()
  synopsis = next_page_infor %>% html_nodes(".ratings-bar+ .text-muted") %>% html_text() %>% str_trim() %>% as.character()
  rate = next_page_infor %>% html_nodes(".ratings-imdb-rating strong") %>% html_text() %>% as.numeric()

  # Extract the relative URL of each title and prepend the domain name.
  movie_links = next_page_infor %>%
    html_nodes(".lister-item-header a") %>%
    html_attr("href") %>%
    paste0("https://www.imdb.com", .)

  cast = sapply(movie_links, FUN = get_cast, USE.NAMES = FALSE)

  scraped_info = rbind(movies, data.frame(name, year, synopsis, rate, cast, stringsAsFactors = FALSE))
  print(paste0("Page No:", page_result))
}

Appreciate your time.

Regards
Karthik

Hi,

Thanks for providing the code, but next time don't forget the libraries (I had to Google and guess).

Here is a way to fix your issue:

library(tidyverse)
library(xml2)
library(rvest)

get_cast = function(movie_links){
  movie_page = read_html(movie_links)
  movie_cast = movie_page %>% html_nodes(".primary_photo+ td a") %>% html_text() %>% paste(collapse = ",")
  return(movie_cast)
}

movies = data.frame()

for(page_result in seq(from=1,to=101,by=50)){
  next_page_link = paste0("https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&start=",page_result,"&ref_=adv_nxt")
  
  next_page_infor = read_html(next_page_link)
  
  name = next_page_infor %>% html_nodes(".lister-item-header a") %>% html_text()
  year = next_page_infor %>% html_nodes(".text-muted.unbold") %>% html_text() %>% str_trim() 

  # Changed: grab each movie's ".lister-item-content" block first, then pull the
  # synopsis and rating from within each block so missing values can be detected.
  synopsis = next_page_infor %>% html_nodes(".lister-item-content")
  synopsis = sapply(synopsis, function(x){
    x %>% html_nodes(".ratings-bar+ .text-muted") %>%
      html_text() %>% str_trim() %>% as.character()
  }) %>% sapply(function(x) ifelse(length(x) == 0, NA, x))

  rate = next_page_infor %>% html_nodes(".lister-item-content")
  rate = sapply(rate, function(x){
    x %>% html_nodes(".ratings-imdb-rating strong") %>%
      html_text() %>% as.numeric()
  }) %>% sapply(function(x) ifelse(length(x) == 0, NA, x))
  #-----
  
  # Extract the relative URL of each title and prepend the domain name.
  movie_links = next_page_infor %>%
    html_nodes(".lister-item-header a") %>%
    html_attr("href") %>%
    paste0("https://www.imdb.com", .)
  
  
  cast = sapply(movie_links,FUN = get_cast,USE.NAMES = FALSE)
  
  # Changed: accumulate into movies instead of scraped_info so results from all pages are kept.
  movies = rbind(movies, data.frame(name, year, synopsis, rate, cast, stringsAsFactors = FALSE))
  print(paste0("Page No:",page_result))
  
}

Basically, what I did was take the lowest common node that every movie definitely has, i.e. ".lister-item-content", extract those as blocks, and then loop over them with sapply(). If a piece of content is missing, the extraction returns a zero-length result, which I convert into NA so that every column still has 50 entries per page and the data frame can be built without a length mismatch.
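If you want to reuse that pattern, here is a minimal sketch of it as a small helper. The name node_text_or_na() is made up purely for illustration, and the selectors are the same ones from the code above; treat it as a sketch rather than a drop-in replacement.

library(rvest)
library(stringr)

# Hypothetical helper: return the trimmed text of the first match of `css`
# inside `block`, or NA when the selector matches nothing.
node_text_or_na = function(block, css){
  out = block %>% html_nodes(css) %>% html_text() %>% str_trim()
  if(length(out) == 0) NA_character_ else out[1]
}

# Possible usage inside the page loop, reusing the per-movie blocks:
blocks = next_page_infor %>% html_nodes(".lister-item-content")
rate = sapply(blocks, node_text_or_na, css = ".ratings-imdb-rating strong") %>% as.numeric()
synopsis = sapply(blocks, node_text_or_na, css = ".ratings-bar+ .text-muted")

As a side note, if you are on rvest 1.0 or later, html_element() (singular) should give you NA directly for nodes where the selector finds nothing, so the zero-length-to-NA step may not even be needed there.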

Hope this helps,
PJ
