I am trying to create a prediction algorithm for the `heights` dataset in dslabs. I am a bit confused about what value I should give for `n`. My main code is this:
library(dslabs)
library(tidyverse)
library(caret)
library(e1071)
library(plotROC)
library(pROC)
# Load the heights data and take a quick look at the first rows
data(heights)
head(heights)
# Outcome (sex) and predictor (height) pulled out as plain vectors
y <- heights[["sex"]]
x <- heights[["height"]]
# Fix the RNG seed so the partition below is reproducible
set.seed(2007)
# Draw the row indices for the test set (p = 0.5 of the rows); every
# remaining row goes to the training set
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- heights[-test_index, ]
test_set <- heights[test_index, ]
# Candidate height cutoffs; each one is scored on the training set only
cutoff <- seq(61, 70)

# Training-set accuracy of the rule "taller than the cutoff => Male"
accuracy <- map_dbl(cutoff, function(threshold) {
  predicted <- factor(
    ifelse(train_set$height > threshold, "Male", "Female"),
    levels = levels(test_set$sex)
  )
  mean(predicted == train_set$sex)
})

# Training-set F1 score of the same rule
F_1 <- map_dbl(cutoff, function(threshold) {
  predicted <- factor(
    ifelse(train_set$height > threshold, "Male", "Female"),
    levels = levels(test_set$sex)
  )
  F_meas(data = predicted, reference = factor(train_set$sex))
})

# Keep the cutoff that maximises each metric (first one wins on ties)
best_cutoff_accuracy <- cutoff[which.max(accuracy)]
best_cutoff_F1score <- cutoff[which.max(F_1)]
# Evaluate both tuned cutoffs on the held-out test set.
# The predictions have to be built BEFORE they are evaluated; the previous
# version referenced an undefined `y_hat` and only created the prediction
# vectors afterwards.

# Predictions using the cutoff that maximised training accuracy
y_hat_accuracy <- ifelse(test_set$height > best_cutoff_accuracy, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))
# Predictions using the cutoff that maximised training F1 score
y_hat_F1score <- ifelse(test_set$height > best_cutoff_F1score, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))

# Accuracy-tuned cutoff: raw counts table, then the full confusion matrix
table(predicted = y_hat_accuracy, actual = test_set$sex)
cm <- confusionMatrix(data = y_hat_accuracy, reference = test_set$sex)
cm
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity", "Prevalence")]

# F1-tuned cutoff: same summaries, so the two rules can be compared
table(predicted = y_hat_F1score, actual = test_set$sex)
cm_F1score <- confusionMatrix(data = y_hat_F1score, reference = test_set$sex)
cm_F1score
cm_F1score$overall["Accuracy"]
cm_F1score$byClass[c("Sensitivity", "Specificity", "Prevalence")]

# Sensitivity and specificity computed directly for the F1-tuned predictions
sensitivity(data = y_hat_F1score, reference = test_set$sex)
specificity(data = y_hat_F1score, reference = test_set$sex)
# ROC points (FPR vs TPR) for a baseline that guesses the sex at random.
# For each probability p of guessing "Male", draw one guess per test-set row
# and record the resulting false-positive and true-positive rates.
probs <- seq(0, 1, length.out = 10)
guessing <- map_df(probs, function(p) {
  # The sample size must be the number of observations, nrow(test_set).
  # length(test_set) is the number of COLUMNS of the data frame, which would
  # produce only a couple of guesses.
  y_hat <-
    sample(c("Male", "Female"), nrow(test_set), replace = TRUE, prob = c(p, 1 - p)) %>%
    factor(levels = c("Female", "Male"))
  list(method = "Guessing",
       FPR = 1 - specificity(y_hat, test_set$sex),
       TPR = sensitivity(y_hat, test_set$sex))
})
# Plot TPR against FPR explicitly; plot(guessing) on the whole data frame
# would draw a pairs plot that includes the constant `method` column.
plot(guessing$FPR, guessing$TPR, type = "b",
     xlab = "FPR (1 - specificity)", ylab = "TPR (sensitivity)")
Thanks!