i wrote this code and it worked got the required output but i saved it to go get a coffee and tried opening it again afterwards and it would display error on the last 2 lines every time ... can some one help ? im uploading the code and the error picture below.
# Netflix Data Kaggle
# Importing Libraries
library(tidyverse)
library(lubridate)
library(naniar)
library(skimr)
# loading the dataset
netflix <- read_csv("c://users/kumarappan M/documents/netflix_titles.csv")
tibble(netflix)
# Summary of the dataset
skim_without_charts(netflix)
# Missing values
vis_miss(netflix)
gg_miss_fct( x=netflix , fct = type ) + labs ( title = " Missing Variables by type" , x = "Type")
## As you can see there are way too many missing values in the fields director , cast and country . i will create 2 data frames one to use later for analysis on directors and the other to drop the column director and cast as they contain too many missing values and its impossible to proxy the data. i will also replace the missing values in the country,date_added and rating column's by substituting the missing values with their respective modes.
# Create the function mode as R doesnt have a pre set function to calculate mode then use the `ifelse()` to change the NA's to mode.
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
netflix$country <- ifelse( is.na(netflix$country),
getmode(netflix$country),
netflix$country)
netflix$date_added <- ifelse( is.na(netflix$date_added) , getmode(netflix$date_added) , netflix$date_added )
netflix$rating <- ifelse( is.na(netflix$rating) , getmode(netflix$rating) , netflix$rating)
netflix <- netflix %>% select( -c(description,cast,director,listed_in))
skim_without_charts(netflix)
# All Na's have been cleaned or replaced in the data set ^^^
#### Data cleaning and manipulation for analysis
# Some countries have multiple values and needs to be separated to just the first country, the main country of production.
netflix <- separate( netflix , country , into = c( "production_country") , sep = ",")
skim_without_charts(netflix)
# create a month and year added column
netflix <- netflix %>% separate ( date_added , into = c("month_added" , "year_added" ) , sep=", ")
netflix <- netflix %>% separate ( month_added , into = c("month_added" ) , sep= " ")
# The rating column can be segregated into ages , to anlyse the values by age category. i used this link <https://www.spectrum.net/support/tv/tv-and-movie-ratings-descriptions/> to get info on rating categories.
netflix$rating <- gsub ( "TV-MA" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "TV-PG" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "TV-14" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "UR" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "R" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "NR" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "NC-17" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "TV-Y" , "children" , netflix$rating)
netflix$rating <- gsub ( "TV-Y7" , "Older kids" , netflix$rating)
netflix$rating <- gsub ( "TV-Y7-FV" , "Older kids" , netflix$rating)
netflix$rating <- gsub ( "PG" , "Older kids" , netflix$rating)
netflix$rating <- gsub ( "PG-13" , "Teens" , netflix$rating)
netflix$rating <- gsub ( "G" , "General audience" , netflix$rating)
netflix$rating <- gsub ( "TV-G" , "General audience" , netflix$rating)
netflix$rating <- gsub ( "Older kids-13" , "Older kids" , netflix$rating)
netflix$rating <- gsub ( "General audienceeneral audience" , "General audience" , netflix$rating)
netflix$rating <- gsub ( "NAdult" , "Adult" , netflix$rating)
netflix$rating <- gsub ( "children7" , "children" , netflix$rating)
# split the data into 2 sets one with just movies and one with just tv shows
netflix_shows <- netflix %>% filter(type == "TV Show")
netflix_movies <- netflix %>% filter(type == "Movie")
# Visualisations
# 1.Distribution by content
pie_1 <- netflix %>% group_by(type) %>% summarise(total=n()) %>% mutate(perc_pie=round( 100*pie_1$total/sum(pie_1$total)))
pie(pie_1$total , paste0(pie_1$perc_pie,"%") , main = "Segregation by type" , col = rainbow(length(pie_1)))
legend("topright", c("Movie" , "TV show"), cex = 0.8,
fill = rainbow(length(pie_1)))
Error: