Hello Friends.
I have a script that encodes txt files: basically, it converts strings into numeric data according to a set of conditions.
It works, but I have some large datasets (>6 GB, 300000 thousand rows), and with a data frame this big my computer cannot run it.
Is there a way to make this code run on large datasets?
{
# Session setup: memory cap, packages, and the input table.
#
# memory.limit() only ever had an effect on Windows, and from R 4.2.0 it is a
# stub that just emits a warning, so it is called only where it still works.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(9999999999)
}
library(data.table)
library(stringr)
# 0 = let data.table use all logical CPUs for fread()/fwrite().
setDTthreads(0)
library(readr)
# Tab-separated input with a header row; the rest of the script treats the
# first column as the hybrid name and every other column as a marker.
Hybrids <- fread("Bigfile4.txt", header = TRUE, sep = "\t")
{
# Additive code for markers that carry only the A and T alleles.
#
# x: vector of genotype strings such as "A/A" or "N/T".
# Returns a factor the same length as x whose levels are the codes written as
# text ("-1", "1", "0", "0.5", "-0.5"); genotypes that share a code share a
# level, and any genotype not listed below becomes NA.
# To get the numbers back, convert through the labels (for example
# as.numeric(as.character(f))); as.numeric(f) alone returns level indices.
genecode.AT <- function(x) {
  genotypes <- c("A/A", "T/T", "A/T", "T/A", "N/T", "T/N", "N/A", "A/N")
  scores <- c(-1, 1, rep(0, 2), rep(0.5, 2), rep(-0.5, 2))
  factor(x, levels = genotypes, labels = scores)
}
# Additive code for markers that carry only the C and G alleles.
#
# x: vector of genotype strings such as "C/C" or "N/G".
# Returns a factor the same length as x whose levels are the codes written as
# text ("1", "-1", "0", "-0.5", "0.5"); genotypes that share a code share a
# level, and any genotype not listed below becomes NA.
# To get the numbers back, convert through the labels (for example
# as.numeric(as.character(f))); as.numeric(f) alone returns level indices.
genecode.CG <- function(x) {
  genotypes <- c("C/C", "G/G", "C/G", "G/C", "N/G", "G/N", "N/C", "C/N")
  scores <- c(1, -1, rep(0, 2), rep(-0.5, 2), rep(0.5, 2))
  factor(x, levels = genotypes, labels = scores)
}
# Additive code for markers that mix A/T alleles with C/G alleles.
#
# x: vector of genotype strings such as "A/A", "T/C" or "G/N".
# Homozygous A or T -> 1, homozygous C or G -> -1, every listed heterozygote
# -> 0, N paired with A or T -> 0.5, N paired with C or G -> -0.5.
# Returns a factor the same length as x whose levels are those codes written
# as text; any genotype not listed below becomes NA.
# To get the numbers back, convert through the labels (for example
# as.numeric(as.character(f))); as.numeric(f) alone returns level indices.
genecode.common <- function(x) {
  homozygous <- c("A/A", "T/T", "C/C", "G/G")
  heterozygous <- c(
    "T/C", "C/T", "T/G", "G/T", "A/T", "T/A",
    "A/C", "C/A", "G/A", "A/G", "G/C", "C/G"
  )
  half_known <- c("N/T", "T/N", "N/A", "A/N", "G/N", "N/G", "C/N", "N/C")
  scores <- c(
    1, 1, -1, -1,
    rep(0, length(heterozygous)),
    rep(0.5, 4), rep(-0.5, 4)
  )
  factor(x, levels = c(homozygous, heterozygous, half_known), labels = scores)
}
# Dominance code, shared by every marker type.
#
# x: vector of genotype strings such as "A/A", "T/C" or "G/N".
# Homozygotes -> 0, every listed heterozygote -> 1, any genotype with one N
# allele -> 0.5.
# Returns a factor the same length as x whose levels are "0", "1" and "0.5";
# any genotype not listed below becomes NA.
# To get the numbers back, convert through the labels (for example
# as.numeric(as.character(f))); as.numeric(f) alone returns level indices.
genecode.dominant <- function(x) {
  homozygous <- c("A/A", "T/T", "C/C", "G/G")
  heterozygous <- c(
    "T/C", "C/T", "T/G", "G/T", "A/T", "T/A",
    "A/C", "C/A", "G/A", "A/G", "G/C", "C/G"
  )
  half_known <- c("N/T", "T/N", "N/A", "A/N", "G/N", "N/G", "C/N", "N/C")
  scores <- c(
    rep(0, length(homozygous)),
    rep(1, length(heterozygous)),
    rep(0.5, length(half_known))
  )
  factor(x, levels = c(homozygous, heterozygous, half_known), labels = scores)
}
}
################################################################################
#Coding Additive Values based on Hybrid Genotypes
################################################################################
{
  # Additive coding of the hybrid genotypes.
  #
  # Reads `Hybrids` (first column = hybrid names, every other column = one
  # marker) and leaves behind, for the later steps of the script: `Markers`,
  # the allele masks, `gene.AT` / `gene.CG` / `gene.common`, `Hybrid.names`,
  # `fillna()` and the finished `Add.code` table.
  #
  # All work is done one column at a time. apply() on a data.table first
  # turns the whole table into a single character matrix, which is a full
  # extra copy of the data; on a multi-GB table that alone can exhaust memory.

  # Extract tag names of the markers and their positions inside `Hybrids`.
  Markers <- colnames(Hybrids)[-1]
  marker_pos <- seq_along(Markers) + 1L

  # One string per marker holding every distinct genotype seen in it.
  # Missing values are dropped first: pasted as "NA" they would be mistaken
  # for an A allele. vapply() always yields exactly one string per marker,
  # whereas apply(., 2, unique) silently returns a matrix when all markers
  # have the same number of distinct genotypes.
  allele_pool <- vapply(marker_pos, function(j) {
    seen <- unique(Hybrids[[j]])
    paste(seen[!is.na(seen)], collapse = "")
  }, character(1))

  # Locate the markers containing the alleles A, T, C, G.
  markers_A <- grepl("A", allele_pool, fixed = TRUE)
  markers_T <- grepl("T", allele_pool, fixed = TRUE)
  markers_C <- grepl("C", allele_pool, fixed = TRUE)
  markers_G <- grepl("G", allele_pool, fixed = TRUE)
  rm(allele_pool)

  # Markers with only A/T alleles, only C/G alleles, and everything else.
  AT <- (markers_A | markers_T) & !(markers_C | markers_G)
  CG <- (markers_C | markers_G) & !(markers_A | markers_T)
  gene.AT <- Hybrids[, marker_pos[AT], with = FALSE]
  gene.CG <- Hybrids[, marker_pos[CG], with = FALSE]
  common <- setdiff(Markers, c(colnames(gene.AT), colnames(gene.CG)))
  gene.common <- Hybrids[, marker_pos[Markers %in% common], with = FALSE]
  rm(marker_pos)
  gc() # Clean the memory after removing objects

  # One-column table with the hybrid names.
  Hybrid.names <- data.table(Hybrids[, 1])

  # Replace the missing codes of one marker by that marker's mean code,
  # rounded to 4 digits. A marker with no known code at all is filled with NaN.
  fillna <- function(x) {
    x[is.na(x)] <- round(mean(x, na.rm = TRUE), digits = 4)
    x
  }

  # Code every column of `dt` with `coder` and return a named list of numeric
  # vectors. The genecode.* functions return factors whose labels are the
  # codes, so the conversion goes through the labels; as.numeric() on the
  # factor itself would return level indices instead of the codes.
  code_markers <- function(dt, coder) {
    lapply(dt, function(genotypes) {
      coded <- coder(genotypes)
      if (is.factor(coded)) {
        coded <- as.numeric(levels(coded))[coded]
      }
      fillna(as.numeric(coded))
    })
  }

  # Merge the three coded groups. Note that the marker order differs from
  # the order in `Hybrids`: A/T markers first, then C/G, then the rest.
  # Concatenating plain lists keeps this correct when a group has no markers.
  additive <- c(
    code_markers(gene.AT, genecode.AT),
    code_markers(gene.CG, genecode.CG),
    code_markers(gene.common, genecode.common)
  )

  # Add hybrid names as the first column of the table.
  Add.code <- as.data.table(c(as.list(Hybrid.names), additive))
  rm(additive, code_markers)
  gc()
}
{
  # Store the additive table as a tab-separated file, then drop it and
  # collect garbage: the dominance step below does not read it.
  fwrite(x = Add.code, file = "Additive_coded.txt", sep = "\t")
  rm(list = "Add.code")
  gc()
}
################################################################################
#Code Dominant values
################################################################################
#Create individual data.table for each allele combination
{
  # Dominance coding of the three genotype subsets built earlier
  # (`gene.AT`, `gene.CG`, `gene.common`); the result is `Dom.code`, with the
  # hybrid names as first column. The subsets are removed once coded.
  #
  # Each marker is coded on its own. apply() on a data.table would first
  # copy the whole subset into a character matrix, and it misbehaves when a
  # subset has no columns.

  # Code every column of `dt` with genecode.dominant() and fill the missing
  # codes with fillna(). genecode.dominant() returns a factor whose labels
  # are the codes, so the conversion goes through the labels; as.numeric()
  # on the factor itself would return level indices instead of the codes.
  dominance_of <- function(dt) {
    lapply(dt, function(genotypes) {
      coded <- genecode.dominant(genotypes)
      if (is.factor(coded)) {
        coded <- as.numeric(levels(coded))[coded]
      }
      fillna(as.numeric(coded))
    })
  }

  # Same marker order as the additive table: A/T, then C/G, then the rest.
  # Every subset is released as soon as it has been coded.
  dominant <- dominance_of(gene.AT)
  rm(gene.AT)
  dominant <- c(dominant, dominance_of(gene.CG))
  rm(gene.CG)
  dominant <- c(dominant, dominance_of(gene.common))
  rm(gene.common)
  gc()

  # Add hybrid names as the first column of the table.
  Dom.code <- as.data.table(c(as.list(Hybrid.names), dominant))
  rm(dominant, dominance_of)
}
################################################################################
#Save coded files
{
  # Store the dominance table as a tab-separated file, then drop it.
  fwrite(x = Dom.code, file = "Dominant_coded.txt", sep = "\t")
  rm(list = "Dom.code")
}
}