Create a for loop to automate an analysis

nikos_geo · February 18, 2023, 9:01pm

I have a data.frame with 41 columns. The first column is my dependent variable (called ntl) and the rest of the columns are me independent variables. The independent variables are named pop, tirs, agbh, lc and they have an associated number to their names which starts from 020 to 200 with step 20 (for example, pop020, lc080, agbh140 etc).

I am running multiple random forest (RF) regression models and each time I am using the dependent variable and a set of 4 independent variables. The set of the independent variables should have the same 3 digit number, for instance the model will be ntl ~ pop080 + tirs080 + agbh080 + lc080 or ntl ~ pop200 + tirs200 + agbh200 + lc200 etc, After finishing the RF I am saving the prediction and the residuals of the RF as rf_pred020.tif and rf_resids020.tif, depending on the 3digit I used in the analysis.

So far, I am chaning manually the 3 digit number where necessary. I want to create a for loop that changes automatically the 3 digit numbers where necessary after every RF model.

Here is the code:

library(caret)
library(terra)
library(randomForest)
library(doParallel)

wd = "path/"

s <- vect(paste0(wd, "lc.shp"))

block.data = read.csv(paste0(wd, "block.data.csv"))

block.data = subset(block.data, select = c(ntl, tirs020, pop020, agbh020, lc020))

set.seed(1123)

samp <- sample(nrow(block.data), 0.80 * nrow(block.data))

train <- block.data[samp, ]

test <- block.data[-samp, ]

no_cores <- detectCores() - 1
cl = makePSOCKcluster(no_cores)
registerDoParallel(cl)

# define the control
trControl = trainControl(method = "repeatedcv", 
                         number = 10, 
                         search = "grid", 
                         repeats = 5,
                         savePredictions = FALSE)

rf_default = train(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
                   data = train, 
                   method = "rf", 
                   metric = "Rsquared", 
                   # tuneLenght = 50,
                   trControl = trControl)

print(rf_default)
# plot(rf_default)

# Search best mtry
set.seed(1234444)
tuneGrid <- expand.grid(.mtry = c(1:4))
rf_mtry <- train(ntl ~ pop + tirs + agbh,
                 data = train,
                 method = "rf",
                 metric = "Rsquared",
                 tuneGrid = tuneGrid,
                 trControl = trControl,
                 importance = TRUE,
                 nodesize = 20,
                 ntree = 1000)
# print(rf_mtry)

best_mtry <- rf_mtry$bestTune$mtry
# best_mtry

# search best maxnodes
# store_maxnode = list()
tuneGrid = expand.grid(.mtry = best_mtry)

best.rsq <- -1
best.maxnodes <- 0
for (maxnodes in c(5:50)){
  set.seed(3455556)
  rf_maxnode = train(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
                     data = train, 
                     method = "rf", 
                     metric = "Rsquared", 
                     tuneGrid = tuneGrid, 
                     trControl = trControl, 
                     importance = TRUE, 
                     nodesize = 20, 
                     maxnodes = maxnodes,
                     # tuneLenght = 50,
                     ntree = 1000)
  rsq <- rf_maxnode$finalModel$rsq
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.maxnodes <- maxnodes
  }
}

# print(paste0("Best R-squared: ", best.rsq, " with maxnodes: ", best.maxnodes))
# best.maxnodes

# search best ntree
# store_maxtrees = list()

best.ntree <- -1
best.rsq <- -1
for (ntree in seq(from = 500, to = 8000, by = 500)) {
  set.seed(67777789)
  rf_maxtrees = train(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
                      data = train, 
                      method = "rf", 
                      metric = "Rsquared", 
                      tuneGrid = tuneGrid, 
                      trControl = trControl, 
                      importance = TRUE, 
                      nodesize = 20, 
                      maxnodes = best.maxnodes, 
                      # tuneLenght = 50,
                      ntree = ntree)
  rsq <- rf_maxtrees$finalModel$rsq
  if (rsq > best.rsq) {
    best.rsq <- rsq
    best.ntree <- ntree
  }
}
# print(paste0("Best R-squared: ", best.rsq, " with best tree number: ", best.ntree))
# best.ntree

# train the model with the optimum rf
fit_rf = train(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
               train, 
               method = "rf", 
               metric = "Rsquared", 
               tuneGrid = tuneGrid, 
               trControl = trControl, 
               importance = TRUE,
               # tuneLenght = 50, 
               nodesize = 20, 
               ntree = best.ntree, 
               maxnodes = best.maxnodes)

fit_rf

# check Rsquared in the test data
fit_rf2 = train(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
                test, 
                method = "rf", 
                metric = "Rsquared", 
                tuneGrid = tuneGrid, 
                trControl = trControl, 
                importance = TRUE,
                # tuneLenght = 50,
                nodesize = 20, 
                ntree = best.ntree, 
                maxnodes = best.maxnodes)

fit_rf2

stopCluster(cl)

pop = rast(paste0(wd, "pop.tif"))
pop_res = rast(paste0(wd, "pop020.tif"))

tirs = rast(paste0(wd, "tirs.tif"))
tirs_res = rast(paste0(wd, "tirs020.tif"))

agbh = rast(paste0(wd, "agbh.tif"))
agbh_res = rast(paste0(wd, "agbh020.tif"))

lc = rast(paste0(wd, "lc.tif"))
lc_res = rast(paste0(wd, "lc020.tif"))

sm = c(pop, tirs, agbh, lc)
names(sm) = c("pop", "tirs", "agbh", "lc")
res(sm)

ntl = rast(paste0(wd, "ntl.tif"))
ntl = resample(ntl, pop_res)

sb = c(ntl, pop_res, tirs_res, agbh_res, lc_res)
names(sb) = c("ntl", "pop", "tirs", "agbh", "lc")

# apply best model (fit_rf) using the whole data.set
m = randomForest(ntl ~ pop020 + tirs020 + agbh020 + lc020, 
                 data = sb, 
                 mtry = best_mtry, 
                 importance = TRUE, 
                 na.action = na.omit, 
                 nodesize = 20, 
                 maxnodes = best.maxnodes, 
                 ntree = best.ntree, 
                 corr.bias = TRUE,
                 replace = TRUE)

m

# predict rf prediction
ps = predict(sm, m, na.rm = TRUE)

ps = crop(ps, ext(s))
ps = mask(ps, s)

writeRaster(ps, paste0(wd, "rf_pred020.tif"), overwrite = TRUE)

# export rf residuals
pb = predict(sb, m, na.rm = TRUE)

rsds = sb$ntl - pb

rsds = crop(rsds, ext(s))
rsds = mask(rsds, s)

writeRaster(rsds, paste0(wd, "rf_resids020.tif"), overwrite = TRUE)

I have tried to create a list of the column names with a 3-digit number using the from, to and seq commands and then use a for loop to loop over the column names and modify the column names in the model formula, read the corresponding raster layer, and perform model fitting and prediction but without success. Can someone provide some guidance? I can share the data.frame via GDrive.

FactOREO · February 18, 2023, 9:50pm

If the main goal is to switch between the numbers (and nothin else), then put everything you have inside a function which takes the number as an argument. Then, you have to create the formula once at the beginning as follows:

nr <- "020"
indep_vars <- paste0(c("pop", "tirs", "agbh", "lc"), nr)
as.formula(paste0("ntl ~ ", paste(indep_vars, collapse = " + ")))
[1] ntl ~ pop020 + tirs020 + agbh020 + lc020

where nr is your function input to change the number at the end. The result is a language object which should be usable inside regression models (since it works inside usual lm(), I guess there should be no problems whatsoever with your packages).

If your function is well written and outputs only your writeRaster() at the end, you could use purrr::wal() and give all the numbers as a chr vector input together with your adjusted code inside a function, and walk() will calculate every regression and write the result to disk. Just remember to use paste or paste0 for all instances where the number is important and relies on your (function) input.

system · April 1, 2023, 9:50pm

This topic was automatically closed 42 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.