Fatal Error Occured R Session Aborted: Create a factor vector of predictions on test data

So I am trying to look at decision trees for random foresting modeling to compare against my base of logistic regression. The other day the predict function was working fine but now every time i get to line 141, R encounters fatal error and aborts session. I have redownloaded R and R studio, restarted my computer, everything I can think of.

#-----Section 01-------------------------------------------
## Load Packages
library(caret)
library(C50)
library(plyr)
library(gmodels)
library(ROCR)

##xG
#Read in events Data
# set working directory
setwd(dirname(file.choose()))
getwd()

events <- read.csv("desc.csv", stringsAsFactors = FALSE)
head(events)    # Inspect top rows of the data
str(events)

# select variables
myvars <- names(events[c(17:22)])
events <- events[myvars]
rm(myvars)
str(events)

#-----Section 02-------------------------------------------
## structure data

# BodyPart
events$bodypart <- factor(events$bodypart, levels = c("1", "2", "3"), exclude = NA,
                          labels = c("right_foot","left_foot", "header"))
summary(events$bodypart)

# Situation
events$situation <- factor(events$situation, levels = c("1", "2", "3", "4"), exclude = NA,
                           labels = c("open_play","set_piece", "corner", "free_kick"))
summary(events$situation)

# Assist Method
events$assist_method <- factor(events$assist_method, levels = c("0", "1", "2", "3", "4"), exclude = NA,
                               labels = c("no_assist","assist_pass", "assist_cross", "assist_header", "assist_throughball"))
summary(events$assist_method)

# Fast_break
events$fast_break <- factor(events$fast_break, levels = c("0", "1"), exclude = NA,
                            labels = c("no","yes"))
summary(events$fast_break)

# Is_goal
events$is_goal <- factor(events$is_goal, levels = c("0", "1"), exclude = NA,
                         labels = c("0","1"))
summary(events$is_goal)

# Location
events$location <- factor(events$location, levels = c("3","7","9","10","11","12","13","14","16"), exclude = NA,
                          labels = c("centre_box","diff_angle","left_side_box", "left_side_6ybox",
                                     "right_side_box","right_side_6ybox","close_range","penalty","long_range"))
summary(events$location)

# create dummy variables
library(dummies)
shots <- dummy.data.frame(events, names = c("location", "assist_method") , sep = ".")

colnames(shots) <- c("centre_box","diff_angle", "left_side_box", "left_side_6ybox", "right_side_box","right_side_6ybox", "close_range",
                     "penalty", "long_range","bodypart","no_assist", "assist_pass", "assist_cross", "assist_header","assist_through_ball","situation",
                     "fast_break","is_goal")

# check for missing data
apply(shots, MARGIN = 2, FUN = function(x) sum(is.na(x)))
library(Amelia)
missmap(shots, col = c("black", "grey"), legend = FALSE)
shots <- na.omit(shots)   # remove any missing data

str(events)

#-----Section 03-------------------------------------------

# train and test subsets
set.seed(12345)
shots.rand <- shots[order(runif("3812")), ]
# split into training (75%) and test (25%) data sets
shots_tr <- shots.rand[1:2859, ]
shots_te <- shots.rand[2860:3812, ]
round(prop.table(table(shots_tr$is_goal))*100,1)
round(prop.table(table(shots_te$is_goal))*100,1)



#-----Section 02-------------------------------------------
# explore the data

# look at two characteristics of the applicant
table(shots$left_side_box)
table(shots$diff_angle_left)

# look at two characteristics of the loan
summary(shots$close_range)
summary(shots$diff_angle_right)

# look at the classification variable
table(shots$is_goal)
prop.table(table(shots$is_goal))

# Turn outcome into Factor 
shots_tr$is_goal <- c('is_goal')
# Convert `day_vector` to a factor with ordered level
shots_tr$is_goal <- factor(shots_tr$is_goal, order = TRUE, levels =c('0','1'))

##Turn Factor into Character and then into Numeric 
shots_tr$is_goal <- as.numeric(as.character(shots_tr$`is_goal`))
shots_te$is_goal <- as.numeric(as.character(shots_te$`is_goal`))


# training a model on the data
# build the simplest decision tree
library(caret)
library(C50)
library(AppliedPredictiveModeling)



set.seed(12345)
shots_model <- C5.0(shots_tr[18], shots_tr$is_goal)
#  [-29] means exclude variable 17 'is_goal'

# display simple facts about the tree
shots_model

# display detailed information about the tree
summary(shots_model)

#-----Section 06-------------------------------------------

##Turn Factor into Character and then into Numeric 
shots_tr$is_goal1 <- as.numeric(as.character(shots_tr$`is_goal`))
shots_te$is_goal1 <- as.numeric(as.character(shots_te$`is_goal`))



# evaluating model performance
# create a factor vector of predictions on test data
shots_pred1 <- predict(shots_model, shots_te)

# cross tabulation of predicted versus actual classes
library(gmodels)
CrossTable(shots_te$is_goal, shots_pred1,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

# more diagnostics
library(caret)
confusionMatrix(shots_pred1, shots_te$is_goal, positive = "1")

#-----Section 07-------------------------------------------
# improving model performance

# pruning the tree to simplify and/or avoid over-fitting
?C5.0Control

set.seed(12345)
shots_prune <- C5.0(shots_te[-18], shots_tr$is_goal,
                     control = C5.0Control(minCases = 9)) # 1% training obs.
shots_prune
summary(shots_prune)
credit_prune_pred <- predict(shots_prune, shots_te)
CrossTable(shots_te$is_goal, shots_prune_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual goal', 'predicted goal'))

confusionMatrix(shots_prune_pred, shots_te$is_goal, positive = "yes")

# boosting the accuracy of decision trees
# boosted decision tree with 10 trials

set.seed(12345)
credit_boost10 <- C5.0(shots_tr[-18], shots_tr$is_goal, control = C5.0Control(minCases = 9), trials = 10)
credit_boost10
summary(credit_boost10)

shots_boost_pred10 <- predict(credit_boost10, shots_te)
CrossTable(shots_te$is_goal, shots_boost_pred10,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual is_goal', 'predicted is_goal'))

confusionMatrix(shots_boost_pred10, shots_te$is_goal, positive = "yes")

# boosted decision tree with 100 trials

set.seed(12345)
shots_boost100 <- C5.0(shots_tr[18], shots$is_goal, control = C5.0Control(minCases = 9), trials = 100)
shotst_boost100

shots_boost_pred100 <- predict(shots_boost100, shots_te)
CrossTable(shots_te$is_goal, shots_boost_pred100,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual is_goal', 'predicted is_goal'))

confusionMatrix(shots_boost_pred100, shots_te$is_goal, positive = "yes")

#-----Section 08-------------------------------------------
# evaluating using ROC curve and value of AUC
# note: only works on two-value outcomes and NOT when using a cost matrix
library(ROCR)

# prepare probability data for outcomes
shots_prob <- predict(shots_boost100, credit_test, type = "prob")
# bind with test and earlier predicted data
shots_res <- cbind(shots_te, shots_boost_pred100, shots_prob)
head(shots_res)
# create a prediction object
shots_pred <- prediction(predictions = shots_res$is_goal, labels = shots$is_goal)

# plot ROC curve
?performance()
shots_perf1 <- performance(shots_pred, measure = "tpr", x.measure = "fpr")
plot(cshots_perf1, lwd = 2)
abline(a = 0, b = 1, lty = 2)

# calculate the area under the curve (AUC)
shots_perf2 <- performance(shots_pred,  measure ="auc")
shots_perf2@y.values

# remove all variables from the environment

rm(list=ls())

A copy of the error message would be helpful.

Also, I'd suggest offering as close to a reprex as possible in setting up your question. With a minimal REPRoducible EXample (reprex) it makes it much easier for others to understand your issue and figure out how to help.
For example, currently the code refers to data folks won't have access to.

If you're unable to recreate your error with dummy data, and you're okay with the public seeing desc.csv, rstudio.cloud might be a good place to host a reprex.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.