Generating new variable base on mean of other columns within datafame

I would like to generate multiple variables based on the mean value of other columns within the data frame. Below is one of the variables, but I have a lot of variables to do like that.

How could I do in these in the loop or any thought !!


df$mean_v1<- 0

df$mean_v1<- mean(df$v1,na.rm=TRUE)

I already listed these variables that I want to calculate the mean.


varlist <- list(v1,v2,v2,v3,...)

I try to put these all into the forloop as below but it doesn't work.


for ( x in varlist) {

df$mean_x<-mean(df[[x]],na.rm==TRUE)

}

please suggest to me !!

I would use the mutate() and across() functions from the dplyr package.

Edit:
If you do not want to do the calculation for all of the columns, they can be selected in many ways including simply listing them in a character vector and passing it to the .cols argument of across().

DF <- data.frame(V1=1:4,V2=3:6,V3=10:13)
library(dplyr,warn.conflicts = FALSE)
DF <-DF %>%  mutate(across(.fns = mean,,.names = "mean_{col}"))
DF
#>   V1 V2 V3 mean_V1 mean_V2 mean_V3
#> 1  1  3 10     2.5     4.5    11.5
#> 2  2  4 11     2.5     4.5    11.5
#> 3  3  5 12     2.5     4.5    11.5
#> 4  4  6 13     2.5     4.5    11.5

Created on 2020-10-17 by the reprex package (v0.3.0)

Your suggestion is very helpful for my cases,
I did coding as below sd and mean for the selected column using the way you suggested.

df <- df %>%  mutate(across(.cols=c(r),.fns=mean,.names = "mean_{col}",na.rm = TRUE))
df <- df %>%  mutate(across(.cols=c(r),.fns=sd,.names = "sd_{col}",na.rm = TRUE))

Furthermore, I also want to calculate max=mean + sd*3 for the selected columns respectively.
I coded the following but getting errors.

df <- df %>%  mutate(across(.cols=c(r),.fns=mean+sd*3,.names = "max_{col}",na.rm = TRUE))

I'm a very new learner of R, could you mind me to show the ways of it.

Thanks

Here is a way to pass multiple function all at once within across() and also define your own function using ~.

DF <- data.frame(V1=1:4,V2=3:6,V3=10:13)
library(dplyr,warn.conflicts = FALSE)
DF <- DF %>%  mutate(across(.cols = c("V1", "V3"), .fns = list(Mean=mean, SD=sd, Max = ~mean(.x) + 3*sd(.x)),
                            .names = "{fn}_{col}"))

I used the ways you suggested,

df2 <- df %>%  mutate(across(.cols = c(r), .fns = list(Mean=mean,SD=sd, Max = ~mean(.x) + 3*sd(.x)),
                            .names = "{fn}_{col}",na.rm=TRUE))

But, some of Max variables have only NA even those variables have mean and sd values. those variables might have some missing value, but they could produce mean and sd.
Anything to be improved in the coding

I added V4 with missing in DF

DF <- data.frame(V1=1:4,V2=3:6,V3=10:13,V4=c(4,NA,2,NA))

library(dplyr,warn.conflicts = FALSE)
DF <- DF %>%  mutate(across(.cols = c("V1", "V3","V4"), .fns = list(Mean=mean, SD=sd, Max = ~mean(.x) + 3*sd(.x)),
                            .names = "{fn}_{col}",na.rm=TRUE))

 DF
  V1 V2 V3 V4 Mean_V1    SD_V1   Max_V1 Mean_V3    SD_V3   Max_V3 Mean_V4    SD_V4 Max_V4
1  1  3 10  4     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214     NA
2  2  4 11 NA     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214     NA
3  3  5 12  2     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214     NA
4  4  6 13 NA     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214     NA

There is missing in Max_V4, please suggest how to fix it.

You can add na.rm = TRUE to the mean() and sd() function used in the definition of the function that returns Max.

DF <- DF %>%  mutate(across(.cols = c("V1", "V3","V4"), 
                            .fns = list(Mean=mean, SD=sd, 
                                        Max = ~mean(.x, na.rm = TRUE) + 3*sd(.x, na.rm = TRUE)),
                            .names = "{fn}_{col}",na.rm=TRUE))

I got how to fix it, putting na.rm=TRUE for Max function

DF <- DF %>%  mutate(across(.cols = c("V1", "V3","V4"), .fns = list(Mean=mean, SD=sd, Max = ~mean(.x,na.rm=TRUE) + 3*sd(.x,na.rm=TRUE)),
                            .names = "{fn}_{col}",na.rm=TRUE))
DF

V1 V2 V3 V4 Mean_V1    SD_V1   Max_V1 Mean_V3    SD_V3   Max_V3 Mean_V4    SD_V4   Max_V4
1  1  3 10  4     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214 7.242641
2  2  4 11 NA     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214 7.242641
3  3  5 12  2     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214 7.242641
4  4  6 13 NA     2.5 1.290994 6.372983    11.5 1.290994 15.37298       3 1.414214 7.242641

Thanks

Thanks for your suggestions,
I have still more to do, like subsetting, V1,...., value is or equal or greater than Max_V1,... respectively, and then save them into data frame individually.
I did the following codes.


DF <- data.frame(V1=1:11, 
                 V2=c(0,1,0,50,1,0,-10,0,1,0,NA),
                 V3=21:31,
                 V4=c(0,1,0,50,1,0,-10,0,1,0,NA))

library(dplyr,warn.conflicts = FALSE)
v=list("V1", "V2","V3","V4")

DF <- DF %>%  mutate(across(.cols = c("V1", "V2","V3","V4"), .fns = list( Max = ~mean(.x,na.rm=TRUE) + 2*sd(.x,na.rm=TRUE)),
                            .names = "{fn}_{col}"))

Subsetting them into individual data frame once V1,..., is greater or equal to Max_V1,--- respectively .

for ( i in v) {
  print(i)
  M<-paste0("Max_",i,sep="") 
  assign(paste0("s",i),subset(DF, i >= M))
}

The subsetting doesn't apply them with that condition, and still is saving all rows

sV4
   V1  V2 V3  V4   Max_V1   Max_V2   Max_V3   Max_V4
1   1   0 21   0 12.63325 37.08279 32.63325 37.08279
2   2   1 22   1 12.63325 37.08279 32.63325 37.08279
3   3   0 23   0 12.63325 37.08279 32.63325 37.08279
4   4  50 24  50 12.63325 37.08279 32.63325 37.08279
5   5   1 25   1 12.63325 37.08279 32.63325 37.08279
6   6   0 26   0 12.63325 37.08279 32.63325 37.08279
7   7 -10 27 -10 12.63325 37.08279 32.63325 37.08279
8   8   0 28   0 12.63325 37.08279 32.63325 37.08279
9   9   1 29   1 12.63325 37.08279 32.63325 37.08279
10 10   0 30   0 12.63325 37.08279 32.63325 37.08279
11 11  NA 31  NA 12.63325 37.08279 32.63325 37.08279

Once V4 is 50 and Max_V4=37.08279, should subset only the row of it, but not happens, please suggest!!

I would use an entirely different approach than what has been discussed so far. I would reshape the data to a longer format so the max of each Vx can be easily calculated and compared against the individual values. The final data frame in the following code has a RowFlag column that is TRUE if the Max value for any Vx is exceeded.

library(tidyr)
library(dplyr, warn.conflicts = FALSE)

DF <- data.frame(V1=1:11, 
                 V2=c(0,1,0,50,1,0,-10,0,1,0,NA),
                 V3=21:31,
                 V4=c(0,1,0,50,1,0,-10,0,1,0,NA))
#Add a row identifier
DF <- DF %>% mutate(RowNum = row_number())

#reshape the data to a long format
DFlong <- DF %>% pivot_longer(V1:V4, names_to = "V_value", values_to = "Value")

#Calculate the Max of V1, V2, etc.
V_Stats <- DFlong %>% group_by(V_value) %>% 
  summarize(Max = mean(Value, na.rm = TRUE) + 2 * sd(Value, na.rm = TRUE))
#> `summarise()` ungrouping output (override with `.groups` argument)
V_Stats
#> # A tibble: 4 x 2
#>   V_value   Max
#>   <chr>   <dbl>
#> 1 V1       12.6
#> 2 V2       37.1
#> 3 V3       32.6
#> 4 V4       37.1

#Mark each value whether or not it is greater than the respective Max
DFlong <- inner_join(DFlong, V_Stats, by = "V_value")
DFlong <- DFlong %>% mutate(Flag = Value > Max)

#Calc if any Max is exceeded for each row identifier
FlagStats <- DFlong %>% group_by(RowNum) %>% summarize(RowFlag = any(Flag, na.rm = TRUE))
#> `summarise()` ungrouping output (override with `.groups` argument)

#Make the data wide again and join the RowFlag values
DFwide <- DFlong %>% select(-Flag) %>% 
  pivot_wider(names_from = V_value, values_from = c("Value", "Max"))
DFwide <- inner_join(DFwide, FlagStats, by = "RowNum")
DFwide
#> # A tibble: 11 x 10
#>    RowNum Value_V1 Value_V2 Value_V3 Value_V4 Max_V1 Max_V2 Max_V3 Max_V4
#>     <int>    <dbl>    <dbl>    <dbl>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1      1        1        0       21        0   12.6   37.1   32.6   37.1
#>  2      2        2        1       22        1   12.6   37.1   32.6   37.1
#>  3      3        3        0       23        0   12.6   37.1   32.6   37.1
#>  4      4        4       50       24       50   12.6   37.1   32.6   37.1
#>  5      5        5        1       25        1   12.6   37.1   32.6   37.1
#>  6      6        6        0       26        0   12.6   37.1   32.6   37.1
#>  7      7        7      -10       27      -10   12.6   37.1   32.6   37.1
#>  8      8        8        0       28        0   12.6   37.1   32.6   37.1
#>  9      9        9        1       29        1   12.6   37.1   32.6   37.1
#> 10     10       10        0       30        0   12.6   37.1   32.6   37.1
#> 11     11       11       NA       31       NA   12.6   37.1   32.6   37.1
#> # ... with 1 more variable: RowFlag <lgl>

#Can filter DFwide on the value of RowFlag to get the rows where
# any Vx value exceeds Max_Vx

Created on 2020-10-18 by the reprex package (v0.3.0)

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.