I have this dataset in R that looks something like this:
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
Using the R programming language, I wrote this loop that calculates conditional probabilities for the next exam results of a student conditional on the previous exam:
library(data.table)
setDT(my_data)
my_list = vector("list", length(unique(my_data$id)))
# Create an empty vector with pre-specified dimensions
my_vector = vector("list", 100)
for (i in 1:length(unique(my_data$id)))
{
tryCatch({
start_i = my_data[my_data$id == i,]
pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
frame_i = as.data.table(table(pairs_i))
frame_i[, id := i]
print(frame_i)
my_vector[[i]] = frame_i
}, error = function(e){})
}
final = rbindlist(my_vector)
I am now trying to "vectorize" this code for improved efficiency. Here is my attempt:
# I don't think I need to create a "list or vector" to store the results in advance?
my_vector = sapply(unique(my_data$id), function(i) {
{tryCatch({
setDT(my_data)
start_i = my_data[my_data$id == i,]
pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
frame_i = as.data.frame(table(pairs_i))
frame_i$i = i
print(frame_i)
return(frame_i)
}, error = function(e){print(paste("An error occurred:", e))})
}
})
# produced an error, so I tried a different code
final = rbindlist(my_vector, fill = TRUE)
# not sure if this fully worked either?
final = do.call(rbind.data.frame, my_vector)
Have I correctly "vectorized" this code?
Thanks!