GLM/logistic regression code getting frozen/taking too long to run

eyavuz21 · April 9, 2020, 1:30pm

Hey all,

I am trying to run the following code. My computer keeps getting frozen when I try to run it. Therefore, I can see the correlation matrices, I am unable to view the results of the GLM/data arrays.

# running the assay


#which_p_value = "x1"
which_p_value = "groupcategory"
#which_p_value = "x1:groupcategory"

run_anova = FALSE 
simulate_mixed_effect = TRUE 
mixed_effect_sd = 3.094069
mixed_effect_sd_slope = 3.098661

library(tidyverse)
n_people <- c(2,5,10,15,20)
coef1 <- 1.61
coef2 <- -0.01
#coef3 <- 5
#coef4 <- 0

g1 = 0
g2 = 1
g3 = 2 
distances <- c(60,90,135,202.5,303.75,455.625)/100
n_trials <- 35
oneto1000 <- 25
n_track_lengths <- length(distances)
groupcategory = c(rep(g1, n_track_lengths), rep(g2, n_track_lengths),rep(g3,n_track_lengths))
z = c(n_people)
emptydataframeforpowerplots = NULL

coef3s <- c(-5, -4, -3, -2,-1, 0, 1, 2, 3, 4, 5)
coef4s <- c(-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1)

Datarray <- array(dim=c(length(coef3s), length(coef4s),length(n_people)))

coef3_counter =1
for (coef3 in coef3s) {
  coef4_counter =1
  for (coef4 in coef4s) {
    z1_g2 <- coef1 + coef2*distances + coef3*g2 + coef4*g2*distances
    z1_g3 <- coef1 + coef2*distances + coef3*g3 + coef4*g3*distances
    d = NULL
    pr1 = 1/(1+exp(-z1_g2))
    pr2 = 1/(1+exp(-z1_g3))
    
    counter=1
    for (i in n_people) {
      for (j in 1:oneto1000){
        df <- c()
        for (k in 1:i){
          
          # random effect from drawing a random intercept with sd = x
          
          if (simulate_mixed_effect){
            coef1_r = rnorm(1, mean=coef1, sd=mixed_effect_sd)
            coef2_r = rnorm(1, mean=coef1, sd=mixed_effect_sd_slope)
          } else {
            coef1_r = coef1
            coef2_r = coef2
          }
  
          z_g1 <- coef1_r + coef2*distances + coef3*g1 + coef4*g1*distances
          pr = 1/(1+exp(-z_g1))
          
          z1_g2 <- coef1_r + coef2*distances + coef3*g2 + coef4*g2*distances
          pr1 = 1/(1+exp(-z1_g2))
          
          if (run_anova) {
            df <- rbind(df, data.frame(x1 = c(rep(distances, 3)),
                                     y = c(rbinom(n_track_lengths,n_trials,pr), rbinom(n_track_lengths,n_trials,pr1),rbinom(n_track_lengths,n_trials,pr2)),
                                     groupcategory = groupcategory, id = c(rep(k,18))))
            
          } else { # this is for glmer data organisation
            for (m in 1:n_trials) {
            df <- rbind(df, data.frame(x1 = c(rep(distances, 3)),
 y = c(rbinom(n_track_lengths,1,pr),rbinom(n_track_lengths,1,pr1),rbinom(n_track_lengths,1,pr2)),groupcategory = groupcategory,id = c(rep(k,18))))
          
            }
          }
        }
        
        if (run_anova) {
          #df_aov <- aov(y~x1*groupcategory+Error(id/(x1*groupcategory)),data=df)
          #df_aov_sum <- summary(df_aov)
          #pvalue <- df_aov_sum[[5]][[1]][which_p_value,"Pr(>F)"]
          
          df_aov <- aov(y~x1*groupcategory+Error(id),data=df)
          df_aov_sum <- summary(df_aov)
          pvalue <- df_aov_sum[[2]][[1]][which_p_value, "Pr(>F)"]
        } 
 checkme <- df %>% group_by(groupcategory,id) %>% summarise(miny=min(y),maxy=max(y)) %>% mutate(expectfail = miny==maxy)
else { 
          mod_group_glmer <-  glmer(y ~ x1 + groupcategory + (1+x1|id), data = df, family = "binomial")
          sum <- summary(mod_group_glmer)
          pvalue <- sum$coefficients[which_p_value, "Pr(>|z|)"]
        }

        d = rbind(d,data.frame(pvalue))
      }
      count <- plyr::ldply(d,function(c) sum(c<=0.05))
      Datarray[coef3_counter,coef4_counter,counter] <- count$V1/oneto1000
      counter = counter +1
      d = NULL
    }
    coef4_counter = coef4_counter + 1
  }
  coef3_counter = coef3_counter + 1
}

Does anybody have any advice on how I can overcome this issue?

nirgrahamuk · April 16, 2020, 7:55am

in the code you provide here checkme serves no purpose.
You could move it up to the poiint where you rbind df's together.
the rbind of df's is the most expensive operation you run, many many times.
And a very high percentage of the time, because your data doesnt contain two target values, the data is useless to you. so maybe you can use the check and skip binding it.

system · May 7, 2020, 7:55am

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.