Hey guys, I am new to coding and I wanted to use the quarantine to learn something new. I read about the interesting topic "dealing with imbalanced data" and wanted to try it out myself. I want to take an imbalanced binary two-class dataset and "make it better" for ML. First I split my data into training and test sets. I fitted a CART model on the training data. Then I calculated the confusion matrix for that model to have a starting point. After that I applied techniques such as SMOTE, ROSE, and over- and under-sampling to my training data. Finally, I fitted each newly balanced dataset again using CART, recalculated the confusion matrix, and checked whether the balanced accuracy changed.
Is this the correct way to handle such a problem?
Here is my code. If you have any suggestions about what to change, please tell me!
# Load and explore the credit-card dataset.
# NOTE(review): a hard-coded setwd() makes the script non-portable; consider
# an RStudio project or the here package instead. Kept so it runs as-is locally.
setwd("C:\\Users\\loren\\Dropbox\\Uni\\Präsentation\\Datensätze")
data <- read.csv("creditcard.csv")
# First look at the raw data
head(data)
library(dplyr)  # fix: glimpse() comes from dplyr and was never loaded
glimpse(data)
# Class distribution (relative and absolute) — this shows the imbalance
prop.table(table(data$Class))
table(data$Class)
summary(data)
str(data)
# Split the data into train/test sets, stratified on Class
library(caret)
set.seed(42)  # fix: make the random partition reproducible
# fix: encode the outcome as a factor BEFORE splitting and modelling so that
# createDataPartition stratifies on the two classes and downstream functions
# (rpart with method = "class", caret's sampling helpers) treat it categorically
data$Class <- factor(data$Class)
index <- createDataPartition(data$Class, p = 0.8, list = FALSE)
train_data <- data[index, ]
test_data <- data[-index, ]
# Class distribution in both partitions — the imbalance should be preserved
table(train_data$Class)
prop.table(table(train_data$Class))
nrow(train_data)
table(test_data$Class)
prop.table(table(test_data$Class))
nrow(test_data)
# Baseline CART model on the imbalanced training data,
# evaluated on the held-out test data
library(rpart)
library(rpart.plot)  # fix: rpart.plot() was called below but never loaded
library(caret)
# install.packages("e1071")  # needed once for confusionMatrix(); run interactively
library(e1071)
# rpart = recursive partitioning; cp = 0 grows the full, unpruned tree
fit_train <- rpart(Class ~ ., data = train_data, method = "class",
                   control = rpart.control(cp = 0))
# Prune back with a cp value read off the cp table (see printcp/plotcp below)
prune_train <- prune(fit_train, cp = 0.0084)
rpart.plot(prune_train)
summary(fit_train)
rpart.plot(fit_train, extra = 4)
printcp(fit_train)
plotcp(fit_train)
pred_fit_train <- predict(fit_train, newdata = test_data, type = "class")
table(test_data$Class, pred_fit_train)
# Accuracy / specificity / sensitivity / precision / recall on the test data
confusionMatrix(data = pred_fit_train,
                reference = factor(test_data$Class),
                positive = "1")
# No-information rate should be below the accuracy
# With positive = "1": sensitivity --> how well class "1" (fraud) is found,
#                      specificity --> how well class "0" is found
# MCC (Matthews correlation coefficient, robust for imbalanced data)
# install.packages("mccr")  # NOTE(review): run once interactively, not in the script
library(mccr)
# fix: mccr() expects (actual, predicted) as 0/1 vectors — the original
# passed the fitted model as the first argument
mccr(as.numeric(as.character(test_data$Class)),
     as.numeric(as.character(pred_fit_train)))
# Encode the binary outcome as a factor in both partitions so that
# classification functions (rpart, caret's sampling helpers) treat it
# as categorical rather than numeric
train_data$Class <- as.factor(train_data$Class)
test_data$Class <- as.factor(test_data$Class)
# Down-sampling: randomly drop majority-class rows until both classes
# are equally frequent, then refit CART and re-evaluate on the test set
library(caret)
set.seed(42)  # fix: down-sampling is random; seed for reproducibility
# fix: select predictors by name instead of assuming Class is the last column
down_train <- downSample(x = train_data[, setdiff(names(train_data), "Class")],
                         y = train_data$Class)
table(down_train$Class)
prop.table(table(down_train$Class))
fit_down <- rpart(Class ~ ., data = down_train, method = "class")
pred_down <- predict(fit_down, newdata = test_data, type = "class")
summary(pred_down)
# Confusion matrix after down-sampling
confusionMatrix(data = pred_down,
                reference = factor(test_data$Class),
                positive = "1")
# Up-sampling: randomly duplicate minority-class rows until both classes
# are equally frequent, then refit CART and re-evaluate on the test set
library(caret)
set.seed(42)  # fix: up-sampling is random; seed for reproducibility
# fix: select predictors by name instead of assuming Class is the last column
up_train <- upSample(x = train_data[, setdiff(names(train_data), "Class")],
                     y = train_data$Class)
table(up_train$Class)
prop.table(table(up_train$Class))
fit_up <- rpart(Class ~ ., data = up_train, method = "class")
pred_up <- predict(fit_up, newdata = test_data, type = "class")
summary(pred_up)
# Confusion matrix after up-sampling
confusionMatrix(data = pred_up,
                reference = factor(test_data$Class),
                positive = "1")
# SMOTE: synthesize new minority-class examples between existing neighbours
# (smotefamily implementation; DMwR was removed from CRAN and its formula
# interface does not exist in smotefamily, so the original call would fail)
# install.packages("smotefamily")  # run once interactively, not in the script
library(smotefamily)
set.seed(42)  # SMOTE picks random neighbours; seed for reproducibility
# fix: smotefamily::SMOTE takes (X, target, K), NOT a formula
predictors <- setdiff(names(train_data), "Class")
smote_out <- SMOTE(X = train_data[, predictors],
                   target = train_data$Class,
                   K = 5)
smote_train <- smote_out$data
# smotefamily names the outcome column "class"; rename and re-factor it so
# the rest of the script can keep using Class
names(smote_train)[names(smote_train) == "class"] <- "Class"
smote_train$Class <- factor(smote_train$Class)
table(smote_train$Class)
prop.table(table(smote_train$Class))
fit_smote <- rpart(Class ~ ., data = smote_train, method = "class")
pred_smote <- predict(fit_smote, newdata = test_data, type = "class")
summary(pred_smote)
# Confusion matrix after SMOTE
confusionMatrix(data = pred_smote,
                reference = factor(test_data$Class),
                positive = "1")
# ROSE: draw a synthetic, balanced sample via smoothed bootstrapping,
# then refit CART and re-evaluate on the test set
library(ROSE)
library(rpart.plot)  # fix: rpart.plot() was called below but never loaded
set.seed(42)  # fix: ROSE draws random samples; seed for reproducibility
rose_train <- ROSE(Class ~ ., data = train_data)$data
table(rose_train$Class)
prop.table(table(rose_train$Class))
fit_rose <- rpart(Class ~ ., data = rose_train, method = "class")
rpart.plot(fit_rose)
pred_rose <- predict(fit_rose, newdata = test_data, type = "class")
summary(pred_rose)
# fix: accuracy.meas() expects predicted scores/probabilities, not hard
# class labels — pass the probability of the positive class "1"
pred_rose_prob <- predict(fit_rose, newdata = test_data, type = "prob")[, "1"]
accuracy.meas(test_data$Class, pred_rose_prob)
# Confusion matrix after ROSE
confusionMatrix(data = pred_rose,
                reference = factor(test_data$Class),
                positive = "1")