R question
Something (dataset?) is not being recognized by R. Output includes NaN and NA.
When running lines one by one, errors occur at print statements. see below.
errors:
print(mean_cv_accuracy) #NaN
print(sd_cv_accuracy) #NA
How to fix?
TASK:
cross-validate a proportional odds logistic regression (polr) object without using the caret package,
use base R functions, the 'MASS' package and polr.
STEP-BY-STEP GUIDE
perform k-fold cross-validation:
SOURCE CODE
# DATA IMPORT
train <- read.csv(file="C:\\data\\train.csv",header=TRUE,sep=",")
test <- read.csv(file="C:\\data\\test.csv", header=TRUE,sep=",")
#names(train)
#cat(" ","\n")
#names(test)
# DATA WRANGLING
# change the quality column to a factor type
train$quality <- as.factor(train$quality)
test$quality <- as.factor(test$quality)
attach(train)
attach(test)
library(MASS)
# Model definition -density
model <- as.formula(quality ~ type + fixed.acidity + volatile.acidity +
citric.acid + residual.sugar + chlorides +
free.sulfur.dioxide + total.sulfur.dioxide +
pH + sulphates + alcohol)
# prepare for future cross-validation function
your_formula <- model
k <- 10
your_data <- train
val_data <- test
quality <- train$quality
your_response_variable <- quality
set.seed(12345) # Set seed for reproducibility
folds <- sample(1:k, nrow(your_data), replace = TRUE)
cv_accuracy <- numeric(k)
for (i in 1:k) {
# Split the data into training and validation sets
train_data <- your_data[folds != i,]
val_data <- your_data[folds == i,]
print(i)
# Fit the polr model to the training data
model <- polr(your_formula, data = train_data, Hess = TRUE)
# Make predictions on the validation set
predictions <- predict(model, newdata = val_data)
# Calculate accuracy for this fold
fold_accuracy <- mean(predictions == val_data$your_response_variable)
# Store the accuracy in the cv_accuracy vector
cv_accuracy[i] <- fold_accuracy
mean_cv_accuracy <- mean(cv_accuracy)
sd_cv_accuracy <- sd(cv_accuracy)
print(mean_cv_accuracy) #NaN
print(sd_cv_accuracy) #NA
}
OUTPUT
[1] NaN
[1] NA
[1] 2
[1] NaN
[1] NA
[1] 3
[1] NaN
[1] NA
[1] 4
[1] NaN
[1] NA
[1] 5
[1] NaN
[1] NA
[1] 6
[1] NaN
[1] NA
[1] 7
[1] NaN
[1] NA
[1] 8
[1] NaN
[1] NA
[1] 9
[1] NaN
[1] NA
[1] 10
[1] NaN
[1] NA
I made a few changes and am getting a new error? Fold accuracy computation is giving an error.
How to fix?
# DATA IMPORT
train <- read.csv(file="C:\\data\\train.csv",header=TRUE,sep=",")
names(train)
library(MASS)
# Model definition -density
model <- as.formula(as.factor(quality) ~ type + fixed.acidity + volatile.acidity +
citric.acid + residual.sugar + chlorides +
free.sulfur.dioxide + total.sulfur.dioxide +
pH + sulphates + alcohol)
# prepare for future cross-validation function
your_formula <- model
k <- 2
your_data <- train
quality <- train$quality
your_response_variable <- quality
set.seed(12345) # Set seed for reproducibility
folds <- sample(1:k, nrow(your_data), replace = TRUE)
cv_accuracy <- numeric(k)
for (i in 1:k) {
# Split the data into training and validation sets
train_data <- your_data[folds != i,]
#val_data <- your_data[folds == i, -ncol(your_data)]
#str(val_data) #quality is not included
val_data <- your_data[folds == i,] #quality is included
# Fit the polr model to the training data
model <- polr(your_formula, data = train_data, Hess = TRUE)
# Make predictions on the validation set
#predictions <- predict(model, newdata = val_data, type="class")
#predictions <- predict(model, newdata = val_data, type="probs")
predictions <- predict(model, newdata = val_data)
# Calculate accuracy for this fold
fold_accuracy <- mean(predictions == your_response_variable[val_data])
#Error in your_response_variable[val_data] : invalid subscript type 'list' <-------------------
# Store the accuracy in the cv_accuracy vector
cv_accuracy[i] <- fold_accuracy
}
mean_cv_accuracy <- mean(cv_accuracy)
sd_cv_accuracy <- sd(cv_accuracy)
print(mean_cv_accuracy)
print(sd_cv_accuracy)