Good Afternoon,

I am trying to calculate the similarity between each row in a large data file (csv) with over 32000 rows. This is the code I am using:

mem.maxVSize(vsize = Inf)

mem.maxNSize(nsize = Inf)

Sys.setenv('R_MAX_VSIZE' = 40000000000000000)

Sys.setenv('R_MAX_NSIZE' = 20000000000000000)

Sys.setenv('R_MAX_MEM_SIZE' = Inf)

rm(list=ls())

gc()

library(dplyr)

row_cf <- function(x, y, df){

sum(df[x,] == df[y,])/ncol(df)

}

data<-read.csv("C:/Users/melanie/Documents/Data/KNNDataCulling.csv", header = TRUE)

data$Particle.Type<-as.factor(data$Particle.Type)

Water<-subset(data, Particle.Type=="WATER")

Water$Particle.Type<-droplevels(Water$Particle.Type)

rm(data)

gc()

Water$Corrected.Diameter.Pixels<-as.numeric(Water$Corrected.Diameter.Pixels)

Water$Contour.Slopes.Focus<-as.numeric(Water$Contour.Slopes.Focus)

Water$Center.Slopes.Focus<-as.numeric(Water$Center.Slopes.Focus)

Water$Hollowness<-as.numeric(Water$Hollowness)

Water$Ellipse.Best.Fit<-as.numeric(Water$Ellipse.Best.Fit)

Water$Ellipse.Minor.Major<-as.numeric(Water$Ellipse.Minor.Major)

Water$Ellipse.Angle<-as.numeric(Water$Ellipse.Angle)

Water$Contour.Circularity<-as.numeric(Water$Contour.Circularity)

Water$Convex.Hull.Circularity<-as.numeric(Water$Convex.Hull.Circularity)

Water$Box.H.W.Ratio<-as.numeric(Water$Box.H.W.Ratio)

Water$Angled.Box.H.W.Ratio<-as.numeric(Water$Angled.Box.H.W.Ratio)

Water <- Water[,-1]

gc()

results <- expand.grid(1:nrow(Water), 1:nrow(Water)) %>%

rename(row_1 = Var1, row_2 = Var2) %>%

rowwise() %>%

mutate(similarity = row_cf(row_1, row_2, Water))

This seems to work fine on a test data set with 100 rows or so, but when I run it on the larger data set, it runs fine for a while and then gives me the error: "no-more-error-handlers-available-recursive-errors-invoking-abort-restart". How can I avoid this from happening?