Store time series data to big matrix

Hi,

I am trying to run a script that stores time series vegetation and reanalysis data based on geographic points in a big matrix. But I have two problems: (1) the code fails at some point (when running the for loop at row 77920) and (2) a reprex of the code displays an error message.

The code successfully works over a smaller study area but I get the following error message when applied over a larger area: Error in SetRows.bm(x, i, value) :
number of items to replace is not a multiple of replacement length

A reprex of the code displays the following error message: Rendering reprex...
Error: This reprex appears to crash R
Call reprex() again with std_out_err = TRUE to get more info

Herewith is the code, it's a bit long...


###This script  stores time series vegetation and reanalysis data based on geographic points in a big matrix
#ts.evi is matrix that contains 19-year vegetation data
#tile.xy are geographic/spatial points of length 705184 (dimension 705184 by 457)
#ts.moist1 is a matrix that contains 19-year moisture data at each geographic point (dimensions 705184 by 240)
#ts.moist2 is a matrix that contains 19-year moisture data at each geographic point, but different to ts.moist2 (dimensions 705184 by 240)
#ts.srad is a matrix that contains 19-year solar radiation data at each geographic point (dimensions 705184 by 240)
#ts.tair is a matrix that contains 19-year air temperature data at each geographic point (dimensions 705184 by 240)
#ts.tsoil is a matrix that contains 19-year soil temperature data at each geographic point (dimensions 705184 by 240)
#The vegetation data is available every 16 days of the time series while the reanalysis data is available on a monthly basis


library(reprex)
library(raster)
library(ncdf4)
library(bigmemory)
library(foreach)
library(doMC)
library(doParallel)
library(raster)
library(ncdf4)
library(bigmemory)
library(rgdal)
library(sp)

cores<-12
registerDoMC(cores=cores)


source("/home/muhoko/Biomes_of_Southern_Africa/Tests/Kruger/dopixel_2020_EVI_kruger_negative1_removed.R")#Code used to run dopixel in the for loop

#--open the ERA file extract time
    setwd("/home/fbiome/Attribution")
    ncERA = nc_open("TTRforceTime.nc")
    lon <- ncvar_get(ncERA, "longitude")
    lat <- ncvar_get(ncERA, "latitude", verbose = F)
    t <- ncvar_get(ncERA, "time")
#--put dates in R date format
    ERA.dates <- as.Date(t/24, origin = '1900-01-01', tz="UTC") #240 dates
    
    #load saved objects
    
    ts.evi <- readRDS("/home/muhoko/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/cropped/export/ts.evi.rds")
    tile.xy <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/tile.xy.rds")
    ts.moist1 <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/ts.moist1.rds")
    ts.moist2 <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/ts.moist2.rds")
    ts.srad <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/ts.srad.rds")
    ts.tair <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/ts.tair.rds")
    ts.tsoil <- readRDS("~/Biomes_of_Southern_Africa/Protected_areas/Protected_areas_NDVI_EVI/Phenological metrics of Southern Africa/Saved_objects/Extracted_data/ts.tsoil.rds")
    
  #------------------------------------------------------------------#
#--get date information from the MODIS time series-----------------#
#------------------------------------------------------------------#
# get the layer names provided by the call to extract evi:
    layer.names<-labels(ts.evi)[[2]]   
# take the substring on these names, which give the dates:
    date.text <-substr(layer.names,start=10,stop=16)
    date.pos <- strptime(date.text,  format = "%Y%j", tz="UTC") #convert to R date format
    
    #------------------------------------------------------------------#
#--create big matrix to store output of do_pixel-------------------#
#------------------------------------------------------------------#
    pixel.date <- as.Date(seq.POSIXt(ISOdate(2000,6,1),by="week",length.out=1020,tz="UTC"))
    
    YEAR <- as.numeric(format(pixel.date, "%Y"))
    YITS = length(unique(YEAR))
#--the column names for the big matrix
    names<-c("lon","lat",
             "lq", "uq", "mean.evi", "sd.evi", "sum.evi", "amplitude",
            "peak.day", "trough.day",
            paste("td", 1:YITS, sep="."), paste("td.x", 1:YITS, sep="."), paste("td.evi", 1:YITS, sep="."),
            paste("pd", 1:YITS, sep="."), paste("pd.x", 1:YITS, sep="."), paste("pd.evi", 1:YITS, sep="."),
            paste("elon.m", 1:YITS, sep="."), paste("elon.m.x", 1:YITS, sep="."),
            paste("eoff.m", 1:YITS, sep="."), paste("eoff.m.x", 1:YITS, sep="."),
            paste("elon.f", 1:YITS, sep="."), paste("elon.f.x", 1:YITS, sep="."),
            paste("eoff.f", 1:YITS, sep="."), paste("eoff.f.x", 1:YITS, sep="."),
            paste("elon.l", 1:YITS, sep="."), paste("elon.l.x", 1:YITS, sep="."),
            paste("eoff.l", 1:YITS, sep="."), paste("eoff.l.x", 1:YITS, sep="."),
            paste("sum.evi.yr", 1:YITS, sep="."), paste("amp", 1:YITS, sep="."),
            paste("gsl", 1:YITS, sep="."), paste("gsl.peak", 1:YITS, sep="."), paste("gsl.long", 1:YITS, sep="."),
            paste("elon.f.evi", 1:YITS, sep="."), paste("elon.m.evi", 1:YITS, sep="."), paste("elon.l.evi", 1:YITS, sep="."),
            paste("eoff.f.evi", 1:YITS, sep="."), paste("eoff.m.evi", 1:YITS, sep="."), paste("eoff.l.evi", 1:YITS, sep="."),
            paste("tair.rank.td", 1:YITS, sep="."),paste("tair.rank.pd", 1:YITS, sep="."),
            paste("moist1.rank.td", 1:YITS, sep="."),paste("moist1.rank.pd", 1:YITS, sep=".")
    ) 
#--create the matrix using pixel.df to specifiy rows
#--and column names to speciify columns
    statsMODIS<-big.matrix(nrow=length(tile.xy), ncol=length(names), type="double",
                  dimnames=list(row.names=NULL, col.names=names),
                  backingfile="statsMODIS.bin", descriptorfile="statsMODIS.desc")
                  
                  
    #------------------------------------------------------------------#
#--start a for loop that will call do_pixel------------------------#
#--this will be slow, but its useful for debugging-----------------#
#------------------------------------------------------------------#
    ERA.dates <- ERA.dates+14
    pixel.df = data.frame(date = pixel.date)
    for(p in 1:dim(ts.evi)[1]) #for all pixels in ts.evi, this will be slow!
    {
    print(p)
#-0-only call dopixel if there is sufficient evi data in the pixel.
    if( length(which(is.na(ts.evi[p,])))  < 100 ) # there are circa 440 data points
        {
#-1-create a data.frame for the pixel
        pixel.df = data.frame(date = pixel.date)
#-2-create an approx function for each ERA variable
        af.tair = approxfun(ERA.dates,ts.tair[1,])
        af.tsoil = approxfun(ERA.dates,ts.tsoil[1,])
        af.srad = approxfun(ERA.dates,ts.srad[1,])
        af.moist1 = approxfun(ERA.dates,ts.moist1[1,])
        af.moist2 = approxfun(ERA.dates,ts.moist2[1,])
#-3-create an approx function for the EVI data
        af.evi = approxfun(as.Date(date.pos),ts.evi[p,])
#-4-use the approxfun's to fill the data frame data.frame
pixel.df$tair <- af.tair(as.Date(pixel.date))
pixel.df$tsoil <- af.tsoil(as.Date(pixel.date))
pixel.df$srad <- af.srad(as.Date(pixel.date))
pixel.df$moist1 <- af.moist1(as.Date(pixel.date))
pixel.df$moist2 <- af.moist2(as.Date(pixel.date))
pixel.df$evi <- af.evi(as.Date(pixel.date))
pixel.df<-pixel.df[complete.cases(pixel.df),]
#head(pixel.df)
coords<-round(coordinates(tile.xy)[p,],2)
statsMODIS[p,] <- dopixel(pixel.df,coords,PLOT=FALSE)
} else 
{
  coords<-round(coordinates(tile.xy)[p,],2)
  statsMODIS[p,] <- c(coords,rep(-9999,length=length(names)-2))
}
}

'''

Your code uses data we do not have and also the functions in dopixel_2020_EVI_kruger_negative1_removed.R that we cannot access. That makes it very difficult to give specific advice.
If you are sure the error is in the for loop, I suggest you comment out the line that uses dopixel(). If the code runs like that, then you know where the problem arises. Is SetRows.bm a function in the dopixel_2020_EVI_kruger_negative1_removed.R code?
If removing dopixel from the loop does not cure the error, then I would further simplify the for loop until you know where it comes from. There are only a few different functions in the loop, so it should not take many iterations to narrow down the possible sources of the problem. You can then present a much more focused problem.

Unfortunately trying to get a reprex of the code displays an error message. I am therefore unable to reproduce the data or the functions in dopixel_2020_EVI_kruger_negative1_removed.R.. Running the code 'statsMODIS[p,] <- dopixel(pixel.df,coords,PLOT=FALSE)' reproduces the same error message, I'm therefore sure that this is where the problem lies.

SetRows.bm is not a function in the codedopixel_2020_EVI_kruger_negative1_removed.R. Removing dopixel cures the error message. The error message implies that I'm dealing with items of different lengths but can't explicitly tell where the problem lies.

is likely the pain point, especially since its being looped over without a pre-allocated receiver object. See R for Data Science:

Before you start the loop, you must always allocate sufficient space for the output. This is very important for efficiency: if you grow the for loop at each iteration using c() (for example), your for loop will be very slow.

When you say

ts.moist1 is a matrix that contains 19-year moisture data at each geographic point (dimensions 705184 by 240)

how many points (long/lat pairs) are involved?

Thanks, I will visit the link you provided. To answer your question, there are 705184 spatial points.

Sorry. You have 13,398,496 observations of each variable?

For each long/lat (705184 ), there are 457 vegetation data observations over a 19-year period. While there are 240 reanalysis observations for each of the long/lat (705184) points over a 19 year period.

Don't loop, use data.table instead (see data.table Cheatsheet for example https://s3.amazonaws.com/assets.datacamp.com/img/blog/data+table+cheat+sheet.pdf )

Again, sorry. I am really having a hard time wrapping my head around this. I'm imagining a spatial polygon object containing spatial points (long/lat). Let it be the San Gabriel Mountain Range above Los Angeles, California, where I did rock sampling on an 800-meter grid a long time ago. But it could be anywhere at any scale.

So, I'm imagining a traverse across the study area and stopping every so often to make observations. Stop 1, measure variables, stop 2, measure variables ... stop 705,184 measure variables. Repeat 19 times. That's 6,123,199,502 observations.

If I'm being dense, again apologies, but that's what we're dealing with?

Thanks I will take a look

My apologies, the reprex fails to reproduce the data. I am basically extracting time series vegetation and climate data over spatial points, then store this in a big matrix. The code fails to store data for all the rows/lonlat points, indicating that lengths of my items are different.

I do understand the big picture. I don't understand the little picture, which is how many spatial points there are, not in the entire expanse of the study area on some grid, but how many were data acquired from. I ask because that bears on the size of the object to be handled and so the appropriate type of data object to handle it.

The spatial points were created from the vegetation grid dataset and were 705184 in total. They were then saved to disk as an rds file. The same points were loaded back in R and used in the analysis.

I'm sorry to keep going in circles. Am I correct that for each and every one of the variables, there are observations for every spatial point? Conversely, although each observation is associated with only one spatial point are there only a relatively few spatial points that have observations?

Yes, correct. Most spatial points have observations while a few do not have. I ran the code below:

print(paste("There are ", sum(!is.na(ts.evi)), "data points extracted."))
[1] "There are 317529899 data points extracted."
print(paste("There are ", sum(is.na(ts.evi)), "NAs"))
[1] "There are 4739189 NAs"

1 Like

Wow, that is a lot of data. I'm going to assume that by "store" is meant a way to gather this all into a single object for later extraction and processing. I have some ideas in mind and will post back.

Thanks, I will be waiting.

It appears that R crushes when I do a reprex of the entire code. I have decided to paste 3 components of the entire script. I have also commented where I suspect the error message ('Error in setRows.bm(x, i, value) ; number of items to replace is not a multiple of replacement length') is originating from. I hope this helps.

library(reprex)
#> Warning: package 'reprex' was built under R version 4.0.4
library(raster)
#> Warning: package 'raster' was built under R version 4.0.4
#> Loading required package: sp
#> Warning: package 'sp' was built under R version 4.0.4
library(ncdf4)
library(bigmemory)
#> Warning: package 'bigmemory' was built under R version 4.0.4
library(foreach)
#> Warning: package 'foreach' was built under R version 4.0.4
#library(doMC)
library(doParallel)
#> Warning: package 'doParallel' was built under R version 4.0.4
#> Loading required package: iterators
#> Warning: package 'iterators' was built under R version 4.0.4
#> Loading required package: parallel
library(raster)
library(ncdf4)
library(rgdal)
#> Warning: package 'rgdal' was built under R version 4.0.4
#> rgdal: version: 1.5-23, (SVN revision 1121)
#> Geospatial Data Abstraction Library extensions to R successfully loaded
#> Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
#> Path to GDAL shared files: C:/Users/Edward Muhoko/Documents/R/win-library/4.0/rgdal/gdal
#> GDAL binary built with GEOS: TRUE 
#> Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
#> Path to PROJ shared files: C:/Users/Edward Muhoko/Documents/R/win-library/4.0/rgdal/proj
#> PROJ CDN enabled: FALSE
#> Linking to sp version:1.4-5
#> To mute warnings of possible GDAL/OSR exportToProj4() degradation,
#> use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
#> Overwritten PROJ_LIB was C:/Users/Edward Muhoko/Documents/R/win-library/4.0/rgdal/proj
library(sp)

#Load the script needed to run do_pixel
source("~/Work/dopixel_2020.R")
ERA.dates <- readRDS("~/Work/ERA.dates.rds") #240 dates

#Load environmental variables

ts.evi <- readRDS("~/Work/ts.evi.rds")
ts.rel <- readRDS("~/Work/ts.rel.rds")
tile.xy <- readRDS("~/Work/tile.xy.rds")
ts.moist1 <- readRDS("~/Work/ts.moist1.rds")
ts.moist2 <- readRDS("~/Work/ts.moist2.rds")
ts.srad <- readRDS("~/Work/ts.srad.rds")
ts.tair <- readRDS("~/Work/ts.tair.rds")
ts.tsoil <- readRDS("~/Work/ts.tsoil.rds")
# get the layer names provided by the call to extract evi:
layer.names<-labels(ts.evi)[[2]]   
# take the substring on these names, which give the dates:
date.text <-substr(layer.names,start=10,stop=16)
date.pos <- strptime(date.text,  format = "%Y%j", tz="UTC") #convert to R date format
# get the layer names provided by the call to extract evi:
layer.names<-labels(ts.evi)[[2]]   
# take the substring on these names, which give the dates:
date.text <-substr(layer.names,start=10,stop=16)
date.pos <- strptime(date.text,  format = "%Y%j", tz="UTC") #convert to R date format

Created on 2021-03-08 by the reprex package (v1.0.0)

pixel.date <- as.Date(seq.POSIXt(ISOdate(2000,6,1),by="week",length.out=1020,tz="UTC"))
#tail(pixel.date)
YEAR <- as.numeric(format(pixel.date, "%Y"))
YITS = length(unique(YEAR))
names<-c("lon","lat",
         "lq", "uq", "mean.evi", "sd.evi", "sum.evi", "amplitude",
         "peak.day", "trough.day",
         paste("td", 1:YITS, sep="."), paste("td.x", 1:YITS, sep="."), paste("td.evi", 1:YITS, sep="."),
         paste("pd", 1:YITS, sep="."), paste("pd.x", 1:YITS, sep="."), paste("pd.evi", 1:YITS, sep="."),
         paste("elon.m", 1:YITS, sep="."), paste("elon.m.x", 1:YITS, sep="."),
         paste("eoff.m", 1:YITS, sep="."), paste("eoff.m.x", 1:YITS, sep="."),
         paste("elon.f", 1:YITS, sep="."), paste("elon.f.x", 1:YITS, sep="."),
         paste("eoff.f", 1:YITS, sep="."), paste("eoff.f.x", 1:YITS, sep="."),
         paste("elon.l", 1:YITS, sep="."), paste("elon.l.x", 1:YITS, sep="."),
         paste("eoff.l", 1:YITS, sep="."), paste("eoff.l.x", 1:YITS, sep="."),
         paste("sum.evi.yr", 1:YITS, sep="."), paste("amp", 1:YITS, sep="."),
         paste("gsl", 1:YITS, sep="."), paste("gsl.peak", 1:YITS, sep="."), paste("gsl.long", 1:YITS, sep="."),
         paste("elon.f.evi", 1:YITS, sep="."), paste("elon.m.evi", 1:YITS, sep="."), paste("elon.l.evi", 1:YITS, sep="."),
         paste("eoff.f.evi", 1:YITS, sep="."), paste("eoff.m.evi", 1:YITS, sep="."), paste("eoff.l.evi", 1:YITS, sep="."),
         paste("tair.rank.td", 1:YITS, sep="."),paste("tair.rank.pd", 1:YITS, sep="."),
         paste("moist1.rank.td", 1:YITS, sep="."),paste("moist1.rank.pd", 1:YITS, sep=".")
) 
statsMODIS<-big.matrix(nrow=length(tile.xy), ncol=length(names), type="double",
                       dimnames=list(row.names=NULL, col.names=names),
                       backingfile="statsMODIS.bin", descriptorfile="statsMODIS.desc")
#> Error in big.matrix(nrow = length(tile.xy), ncol = length(names), type = "double", : could not find function "big.matrix"

Created on 2021-03-08 by the reprex package (v1.0.0)

ERA.dates <- ERA.dates+14
#> Error in eval(expr, envir, enclos): object 'ERA.dates' not found
pixel.df = data.frame(date = pixel.date)
#> Error in data.frame(date = pixel.date): object 'pixel.date' not found
for(p in 1:dim(ts.evi)[1]) #for all pixels in ts.evi, this will be slow!
{
  print(p)
  #-0-only call dopixel if there is sufficient evi data in the pixel.
  if( length(which(is.na(ts.evi[p,])))  < 100 ) # there are circa 440 data points
  {
    #-1-create a data.frame for the pixel
    pixel.df = data.frame(date = pixel.date)
    #-2-create an approx function for each ERA variable
    af.tair = approxfun(ERA.dates,ts.tair[1,])
    af.tsoil = approxfun(ERA.dates,ts.tsoil[1,])
    af.srad = approxfun(ERA.dates,ts.srad[1,])
    af.moist1 = approxfun(ERA.dates,ts.moist1[1,])
    af.moist2 = approxfun(ERA.dates,ts.moist2[1,])
    #-3-create an approx function for the EVI data
    af.evi = approxfun(as.Date(date.pos),ts.evi[p,])
    #-4-use the approxfun's to fill the data frame data.frame
    pixel.df$tair <- af.tair(as.Date(pixel.date))
    pixel.df$tsoil <- af.tsoil(as.Date(pixel.date))
    pixel.df$srad <- af.srad(as.Date(pixel.date))
    pixel.df$moist1 <- af.moist1(as.Date(pixel.date))
    pixel.df$moist2 <- af.moist2(as.Date(pixel.date))
    pixel.df$evi <- af.evi(as.Date(pixel.date))
    pixel.df<-pixel.df[complete.cases(pixel.df),]
    #head(pixel.df)
    coords<-round(coordinates(tile.xy)[p,],2)
    statsMODIS[p,] <- dopixel(pixel.df,coords,PLOT=FALSE)#I suspect this is where the error is coming from
  } else 
  {
    coords<-round(coordinates(tile.xy)[p,],2)
    statsMODIS[p,] <- c(coords,rep(-9999,length=length(names)-2))
  }
} 
#> Error in eval(expr, envir, enclos): object 'ts.evi' not found

Created on 2021-03-08 by the reprex package (v1.0.0)