Web Scraping with Missing Values

Hi,

Thanks for providing the code, but next time don't forget to include the libraries (I had to google and guess which ones you were using).

Here is a way to fix your issue

library(tidyverse)
library(xml2)
library(rvest)

# Download a single movie page and return the text of every node matching
# ".primary_photo+ td a", joined into one comma-separated string.
#
# movie_links: URL (or anything read_html() accepts) of one movie page.
# Returns a length-one character vector; it is "" when nothing matches.
get_cast <- function(movie_links) {
  cast_nodes <- html_nodes(read_html(movie_links), ".primary_photo+ td a")
  cast_names <- html_text(cast_nodes)
  paste(cast_names, collapse = ",")
}

# Scrape the IMDb search results for 2020 feature films page by page and
# collect one row per movie (name, year, synopsis, rate, cast) in `movies`.
# Expects tidyverse and rvest to be attached and get_cast() to be defined.

# For each node in `items`, return the text of the first descendant matching
# `css`, or NA when nothing matches. The result always has exactly one value
# per item, so columns built this way stay aligned even when a movie lacks
# that field. vapply() (rather than sapply()) keeps the result a character
# vector regardless of how many matches each item has.
first_text_or_na <- function(items, css) {
  vapply(
    items,
    function(item) {
      found <- item %>% html_nodes(css) %>% html_text()
      if (length(found) == 0) NA_character_ else found[[1]]
    },
    character(1)
  )
}

# One data frame per results page; they are bound together once after the
# loop instead of growing `movies` with rbind() on every iteration.
page_results <- list()

for (page_result in seq(from = 1, to = 101, by = 50)) {
  next_page_link <- paste0(
    "https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&start=",
    page_result,
    "&ref_=adv_nxt"
  )

  next_page_infor <- read_html(next_page_link)

  name <- next_page_infor %>%
    html_nodes(".lister-item-header a") %>%
    html_text()
  year <- next_page_infor %>%
    html_nodes(".text-muted.unbold") %>%
    html_text() %>%
    str_trim()

  # Synopsis and rating can be absent for a movie, so they are looked up
  # inside each movie's own ".lister-item-content" block; a missing value
  # becomes NA instead of shortening the column.
  movie_blocks <- next_page_infor %>% html_nodes(".lister-item-content")

  synopsis <- movie_blocks %>%
    first_text_or_na(".ratings-bar+ .text-muted") %>%
    str_trim()

  # as.numeric() keeps NA as NA, so `rate` is always a numeric column.
  rate <- movie_blocks %>%
    first_text_or_na(".ratings-imdb-rating strong") %>%
    as.numeric()

  # Extract each movie's relative URL and prefix it with the domain name.
  # (paste0() has no `sep` argument, so none is passed.)
  movie_links <- next_page_infor %>%
    html_nodes(".lister-item-header a") %>%
    html_attr("href") %>%
    paste0("https://www.imdb.com", .)

  # get_cast() returns one string per movie page, hence character(1).
  cast <- vapply(movie_links, get_cast, character(1), USE.NAMES = FALSE)

  # Replaces scraped_info with movies: store this page's rows.
  page_results[[length(page_results) + 1]] <- data.frame(
    name, year, synopsis, rate, cast,
    stringsAsFactors = FALSE
  )
  print(paste0("Page No:", page_result))
}

movies <- do.call(rbind, page_results)

Basically, what I did was take the lowest common node that every movie definitely has, i.e. ".lister-item-content", and extract these nodes as blocks over which I then looped (sapply). If some piece of content is not available for a movie, the lookup inside its block returns an empty (length-0) result, which I convert into NA. That way every column always has exactly one entry per movie (50 per page), so there are no length conflicts when building the data frame.

Hope this helps,
PJ