Sure, sorry about that. Here is the code (the cut2 call is further down, under "IV for numeric data"):
# Load the data set; the first row of the CSV holds the column names.
data <- read.csv("Data_Prediction_loan.csv", header = TRUE)
data1 <- data # Working copy; `data` keeps the values exactly as read from disk
head(data1)
#------------------------------------Basic Exploration of the data--------------------------------------------#
str(data1)
summary(data1)
dim(data1)
# Convert SeniorCitizen to a factor so it is handled as a category.
data1$SeniorCitizen <- as.factor(data1$SeniorCitizen)
str(data1)
#-----------------------------------Missing Value Treatment (if any)-------------------------------------------#
# Count of missing values per column, before imputation.
data.frame(colSums(is.na(data1)))
#---->Substituting missing values with mean
# Address the column by name rather than by position: the row mask and the
# mean are both computed from TotalCharges, so the write must target the same
# column even if the column order of the CSV changes.
data1[is.na(data1$TotalCharges), "TotalCharges"] <-
  mean(data1$TotalCharges, na.rm = TRUE)
# Same count after imputation; TotalCharges should now report zero.
data.frame(colSums(is.na(data1)))
#--------------------------------Information Value Calculation (A variable reduction technique)----------------------------------#
#-----------> Creating two data sets for numeric and categorical values
# Data set with numeric variables
# Split the working copy by column position.
# Columns 1-4 and 6-17 go to the categorical frame together with column 20
# (presumably the Churn target -- TODO confirm against the CSV header);
# every column outside 1-4 and 6-17 goes to the numeric frame.
num <- data1[, -c(1:4, 6:17)] # Numerical data frame
cat <- data1[, c(1:4, 6:17, 20)] # Categorical data frame

# Quick look at both frames: first rows, then structure.
head(cat)
head(num)
str(num)
str(cat)
#---------------------------------------IV for numeric data-------------------------------------------------------#
# Information Value (IV) of one numeric predictor against a binary target.
#
# Requires cut2() from the Hmisc package and sqldf() from the sqldf package
# to be attached (library(Hmisc); library(sqldf)) before this is called.
#
# Arguments:
#   variable  Name (string) of the numeric column in `data` to be binned.
#   target    Name (string) of the target column in `data`; it is summed per
#             bin, so it is assumed to be numeric 0/1 -- TODO confirm.
#   data      Data frame holding both columns.
#   groups    Number of quantile groups requested from cut2().
#
# Returns a one-row data frame with columns `variable` and `IV`. Both columns
# pass through cbind(), so IV comes back as text, not as a number; convert
# with as.numeric(as.character(...)) before doing arithmetic on it.
IVCal <- function(variable, target, data, groups)
{
  # Bin the predictor into `groups` quantile groups.
  data[, "rank"] <- cut2(data[, variable], g = groups)

  # Per bin: number of rows with a non-missing target and the target total.
  # sqldf resolves `data` from this function's environment.
  tableOutput <- sqldf(sprintf("select rank,
count(%s) n,
sum(%s) good
from data
group by rank", target, target))
  tableOutput <- sqldf("select *,
(n - good) bad
from tableOutput")

  # Each bin's percentage share of all bads / all goods.
  tableOutput$bad_rate <- tableOutput$bad / sum(tableOutput$bad) * 100
  tableOutput$good_rate <- tableOutput$good / sum(tableOutput$good) * 100

  # Weight of evidence per bin, and the bin's contribution to the IV.
  # The explicit `*` operators are required: without them the first line does
  # not parse and the second tries to call the log() result as a function.
  log_odds <- log(tableOutput$good_rate / tableOutput$bad_rate)
  tableOutput$WOE <- log_odds * 100
  tableOutput$IV <- log_odds *
    (tableOutput$good_rate - tableOutput$bad_rate) / 100

  # Bins with zero goods or zero bads give Inf/NaN terms; leave them out.
  IV <- sum(tableOutput$IV[is.finite(tableOutput$IV)])
  IV1 <- data.frame(cbind(variable, IV))
  return(IV1)
}
# Information value of each numeric predictor against Churn, ten bins each.
a1 <- IVCal(variable = "tenure", target = "Churn", data = num, groups = 10)
a2 <- IVCal(variable = "MonthlyCharges", target = "Churn", data = num, groups = 10)
a3 <- IVCal(variable = "TotalCharges", target = "Churn", data = num, groups = 10)

# Stack the three one-row results into a single summary table and show it.
IV_num <- data.frame(rbind(a1, a2, a3))
IV_num
And these are the packages I installed:
# Packages this script depends on. Each entry must be the exact package name
# with no surrounding whitespace: " Hmisc" (leading space) is not a valid
# package name, so Hmisc would never be installed or attached and cut2()
# would not be found.
list.of.packages <- c(
  "caret", "ggplot2", "MASS", "car", "mlogit", "caTools", "sqldf", "Hmisc",
  "aod", "BaylorEdPsych", "ResourceSelection", "pROC", "ROCR"
)
Does the cut2 problem have something to do with the packages?