Running code with a large dataset (not enough RAM)

Hello Friends.

I have a script to codify txt files. Basically, it changes strings into numerical data under certain conditions.
It works, but I have some large datasets (>6 GB, 300000 thousand rows), and with a data frame this big my computer cannot run it.

Is there a way to make this code run with a large dataset?

{
#Raise the memory cap only where memory.limit() still has an effect: it is
#Windows-only and became a non-functional stub (warning only) in R 4.2.0.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(9999999999)
}

library(data.table)
library(stringr)
setDTthreads(0) #0 = let data.table use all logical CPUs
library(readr)

#Read the tab-separated genotype table. The first column is used below as the
#hybrid name and every other column as one marker.
Hybrids <- fread("Bigfile4.txt", header = TRUE, sep = "\t")

{
#Additive coding for markers whose alleles are only A and T.
#x: vector of genotype strings such as "A/A".
#Returns a factor whose labels are the numeric codes (as text); genotypes that
#share a code are merged into one level and unlisted genotypes become NA.
genecode.AT <- function(x) {
  genotypes <- c("A/A", "T/T", "A/T", "T/A", "N/T", "T/N", "N/A", "A/N")
  codes <- c(-1, 1, 0, 0, 0.5, 0.5, -0.5, -0.5)
  factor(x, levels = genotypes, labels = codes)
}
#Additive coding for markers whose alleles are only C and G.
#x: vector of genotype strings such as "C/C".
#Returns a factor whose labels are the numeric codes (as text); genotypes that
#share a code are merged into one level and unlisted genotypes become NA.
genecode.CG <- function(x) {
  genotypes <- c("C/C", "G/G", "C/G", "G/C", "N/G", "G/N", "N/C", "C/N")
  codes <- c(1, -1, 0, 0, -0.5, -0.5, 0.5, 0.5)
  factor(x, levels = genotypes, labels = codes)
}
#Additive coding for markers that mix A/T alleles with C/G alleles.
#x: vector of genotype strings such as "A/C".
#Returns a factor whose labels are the numeric codes (as text); genotypes that
#share a code are merged into one level and unlisted genotypes become NA.
genecode.common <- function(x) {
  genotypes <- c(
    "A/A", "T/T", "C/C", "G/G",
    "T/C", "C/T", "T/G", "G/T", "A/T", "T/A",
    "A/C", "C/A", "G/A", "A/G", "G/C", "C/G",
    "N/T", "T/N", "N/A", "A/N",
    "G/N", "N/G", "C/N", "N/C"
  )
  codes <- c(
    1, 1, -1, -1,
    0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0,
    0.5, 0.5, 0.5, 0.5,
    -0.5, -0.5, -0.5, -0.5
  )
  factor(x, levels = genotypes, labels = codes)
}
#Dominance coding, shared by every marker type: homozygotes get 0,
#heterozygotes get 1 and genotypes with an N allele get 0.5.
#x: vector of genotype strings such as "A/T".
#Returns a factor whose labels are the numeric codes (as text); genotypes that
#share a code are merged into one level and unlisted genotypes become NA.
genecode.dominant <- function(x) {
  genotypes <- c(
    "A/A", "T/T", "C/C", "G/G",
    "T/C", "C/T", "T/G", "G/T", "A/T", "T/A",
    "A/C", "C/A", "G/A", "A/G", "G/C", "C/G",
    "N/T", "T/N", "N/A", "A/N",
    "G/N", "N/G", "C/N", "N/C"
  )
  codes <- c(
    0, 0, 0, 0,
    1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1,
    0.5, 0.5, 0.5, 0.5,
    0.5, 0.5, 0.5, 0.5
  )
  factor(x, levels = genotypes, labels = codes)
}
}
################################################################################
#Coding Additive Values based on Hybrid Genotypes
################################################################################
{
#Extract tag name of Markers (every column after the first one)
Markers <- colnames(Hybrids)[-1]
marker_cols <- seq_along(Markers) + 1L #column positions of the markers in Hybrids

#Locate the markers containing the alleles A, T, C, G.
#Each column is scanned directly in Hybrids, so no second copy of the whole
#table is made. vapply() always yields one string per marker, whereas
#apply(..., unique) collapses to a matrix when all markers have the same number
#of distinct genotypes. Missing values are dropped before pasting; otherwise
#the text "NA" would be mistaken for an A allele.
gene_list <- vapply(
  marker_cols,
  function(j) {
    genotypes <- unique(Hybrids[[j]])
    paste(genotypes[!is.na(genotypes)], collapse = "")
  },
  character(1)
)
markers_A <- str_detect(gene_list, "A")
markers_T <- str_detect(gene_list, "T")
markers_C <- str_detect(gene_list, "C")
markers_G <- str_detect(gene_list, "G")

rm(gene_list)
gc() #Clean the memory after removing objects

#Locate makers with alleles A&T and C&G
AT <- (markers_A | markers_T) & !(markers_C | markers_G)
CG <- (markers_C | markers_G) & !(markers_A | markers_T)

#Split the markers into three genotype tables. Selection is by position so
#that repeated column names cannot pick the wrong column.
gene.AT <- Hybrids[, marker_cols[AT], with = FALSE]
gene.CG <- Hybrids[, marker_cols[CG], with = FALSE]
common <- setdiff(Markers, c(colnames(gene.AT), colnames(gene.CG)))
gene.common <- Hybrids[, marker_cols[!(AT | CG)], with = FALSE]

#Create a one-column data.table with the Hybrid names
Hybrid.names <- data.table(Hybrids[, 1])

#Code one marker column and return the numeric codes. The genecode.* functions
#return factors whose labels are the codes, so the labels (not the internal
#integer codes) are converted to numbers.
code_column <- function(column, coder) {
  coded <- coder(column)
  as.numeric(levels(coded))[coded]
}

#Code every marker column by column. lapply() keeps each column separate, so
#the table is never copied into one big character matrix as apply() would do.
#Look that the order of the markers are different from the first Hybrid object
#created: AT markers first, then CG markers, then the remaining ones.
Add.code <- c(
  lapply(gene.AT, code_column, coder = genecode.AT),
  lapply(gene.CG, code_column, coder = genecode.CG),
  lapply(gene.common, code_column, coder = genecode.common)
)
setDT(Add.code) #turn the list of columns into a data.table without copying

rm(code_column, marker_cols)
gc()
#Replace the missing values of a numeric vector with its mean.
#x: numeric vector, possibly containing NA.
#Returns x with every NA replaced by the mean of the non-missing values,
#rounded to 4 decimal places. If every value is missing the mean is NaN, so
#the result is all NaN.
fillna <- function(x) {
  #TRUE instead of T: T is an ordinary variable that can be reassigned
  x[is.na(x)] <- round(mean(x, na.rm = TRUE), digits = 4)
  x
}

#Fill the missing codes of each marker with that marker's mean. The columns
#are updated one at a time and by reference, so the whole table is not copied
#into a matrix as apply() would do.
for (j in seq_along(Add.code)) {
  set(Add.code, j = j, value = fillna(Add.code[[j]]))
}

#Add hybrid names to the table
Add.code <- data.table(Hybrid.names, Add.code)
}
{

#Save the additive codes as a tab-separated file, then release the table
fwrite(x = Add.code, file = "Additive_coded.txt", sep = "\t")

rm(list = "Add.code")
gc()
}

################################################################################
#Code Dominant values
################################################################################
#Create individual data.table for each allele combination
{
#Code one marker column and return the numeric dominance codes.
#genecode.dominant returns a factor whose labels are the codes, so the labels
#(not the internal integer codes) are converted to numbers.
dominant_code <- function(column) {
  coded <- genecode.dominant(column)
  as.numeric(levels(coded))[coded]
}

#Code every marker column by column; lapply() avoids copying each table into
#a character matrix as apply() would do. Each genotype table is released as
#soon as it has been coded.
#Look that the order of the markers are different from the first Hybrid object
#created: AT markers first, then CG markers, then the remaining ones.
Dom.code <- lapply(gene.AT, dominant_code)
rm(gene.AT)
Dom.code <- c(Dom.code, lapply(gene.CG, dominant_code))
rm(gene.CG)
Dom.code <- c(Dom.code, lapply(gene.common, dominant_code))
rm(gene.common)
setDT(Dom.code) #turn the list of columns into a data.table without copying

rm(dominant_code)
gc()

#Fill the missing codes of each marker with that marker's mean, one column at
#a time and by reference.
for (j in seq_along(Dom.code)) {
  set(Dom.code, j = j, value = fillna(Dom.code[[j]]))
}

#Add hybrid names to the table
Dom.code <- data.table(Hybrid.names, Dom.code)
}
################################################################################
#Save coded files
{

#Save the dominance codes as a tab-separated file, then release the table
fwrite(x = Dom.code, file = "Dominant_coded.txt", sep = "\t")

rm(list = "Dom.code")
}
}

Hi,

You might want to look into packages like disk.frame that allow you to handle datasets too large for RAM.

Hope this helps,
PJ

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.