Creating a program that loops 5 values into a mean in one row and places it into a new row repeatedly

AlexisW · April 3, 2023, 12:09am

One note on this: if you want to compare your 5 measurements/second to your other datasets with 1 measurement/second, taking the mean may not be a good idea: you'll end up with smoothing the signal, which might change its properties. I would recommend keeping 1 datapoint per second and discarding the other 4, it'll be a more realistic estimate of what you'd get with 1 measurement/second.

I agree with that, the way R works with data is usually with data of a single type in a column, not in a row.

What jkrideau offers with {data.table} seems perfectly good, but just in case here is a solution using the {tidyverse} instead, that I find more easily readable (it's a matter of personal preference, but since you use {tibble} I assume you already know about the tidyverse).

There are 2 ways to approach it: either you assume that you always have exactly 5 measurements per second, so you count the datapoints by giving them some kind of index, then you can use this index to select the points of interest. Or you assume that the time given in the column title is reliable, and you extract the number of seconds since the start of the measurements. For this I'll use the package {lubridate}, part of the tidyverse.

I'll also use tidyr::pivot_longer() and functions from {dplyr}, all part of the tidyverse.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union


dat0 <- tibble::tribble(
  ~ID, ~`'0:0:0.0`, ~`'0:0:0.2`, ~`'0:0:0.4`, ~`'0:0:0.6`, ~`'0:0:0.8`, ~`'0:0:1.0`, ~`'0:0:1.2`, ~`'0:0:1.4`, ~`'0:0:1.6`, ~`'0:0:1.8`, ~`'0:0:2.0`, ~`'0:0:2.2`, ~`'0:0:2.4`, ~`'0:0:2.6`, ~`'0:0:2.8`, ~`'0:0:3.0`, ~`'0:0:3.2`, ~`'0:0:3.4`, ~`'0:0:3.6`, ~`'0:0:3.8`, ~`'0:0:4.0`, ~`'0:0:4.2`, ~`'0:0:4.4`, ~`'0:0:4.6`, ~`'0:0:4.8`, ~`'0:0:5.0`, ~`'0:0:5.2`, ~`'0:0:5.4`, ~`'0:0:5.6`, ~`'0:0:5.8`, ~`'0:0:6.0`, ~`'0:0:6.2`, ~`'0:0:6.4`, ~`'0:0:6.6`, ~`'0:0:6.8`,
  "SS07",       3.3155,      3.2896,      3.2616,      3.2362,      3.2108,      3.1917,      3.1922,      3.2224,      3.3035,      3.3769,      3.4168,      3.4379,       3.439,       3.429,      3.4038,      3.3954,      3.4381,      3.5125,      3.5836,      3.6335,      3.6605,      3.6707,      3.6636,      3.6455,      3.6129,      3.5788,      3.5375,      3.4973,      3.4596,      3.4205,      3.3855,      3.3557,      3.3372,      3.3372,      3.3482,
  "SS07B",           NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA,          NA
)


# Make it into a column
# optional, convert the time to a period since the start of measurement
dat1 <- dat0 |>
  filter(ID == "SS07") |>
  tidyr::pivot_longer(-ID,
               names_to = "time",
               values_to = "measure") |>
  mutate(time = lubridate::hms(time))


dat1
#> # A tibble: 35 × 3
#>    ID    time     measure
#>    <chr> <Period>   <dbl>
#>  1 SS07  0S          3.32
#>  2 SS07  0.2S        3.29
#>  3 SS07  0.4S        3.26
#>  4 SS07  0.6S        3.24
#>  5 SS07  0.8S        3.21
#>  6 SS07  1S          3.19
#>  7 SS07  1.2S        3.19
#>  8 SS07  1.4S        3.22
#>  9 SS07  1.6S        3.30
#> 10 SS07  1.8S        3.38
#> # … with 25 more rows

# first approach, we create a column of indexes, and we use those to select every 5th measurement
dat1 |>
  mutate(index = rep(1:5, times = nrow(dat1)/5))
#> # A tibble: 35 × 4
#>    ID    time     measure index
#>    <chr> <Period>   <dbl> <int>
#>  1 SS07  0S          3.32     1
#>  2 SS07  0.2S        3.29     2
#>  3 SS07  0.4S        3.26     3
#>  4 SS07  0.6S        3.24     4
#>  5 SS07  0.8S        3.21     5
#>  6 SS07  1S          3.19     1
#>  7 SS07  1.2S        3.19     2
#>  8 SS07  1.4S        3.22     3
#>  9 SS07  1.6S        3.30     4
#> 10 SS07  1.8S        3.38     5
#> # … with 25 more rows

# then we can select the measurements with index `1`
dat1 |>
  mutate(index = rep(1:5, times = nrow(dat1)/5)) |>
  filter(index == 1)
#> # A tibble: 7 × 4
#>   ID    time     measure index
#>   <chr> <Period>   <dbl> <int>
#> 1 SS07  0S          3.32     1
#> 2 SS07  1S          3.19     1
#> 3 SS07  2S          3.42     1
#> 4 SS07  3S          3.40     1
#> 5 SS07  4S          3.66     1
#> 6 SS07  5S          3.58     1
#> 7 SS07  6S          3.39     1


# same, but we give the same index to every group of 5 points
dat1 |>
  mutate(index = rep(1:(nrow(dat1)/5), each = 5))
#> # A tibble: 35 × 4
#>    ID    time     measure index
#>    <chr> <Period>   <dbl> <int>
#>  1 SS07  0S          3.32     1
#>  2 SS07  0.2S        3.29     1
#>  3 SS07  0.4S        3.26     1
#>  4 SS07  0.6S        3.24     1
#>  5 SS07  0.8S        3.21     1
#>  6 SS07  1S          3.19     2
#>  7 SS07  1.2S        3.19     2
#>  8 SS07  1.4S        3.22     2
#>  9 SS07  1.6S        3.30     2
#> 10 SS07  1.8S        3.38     2
#> # … with 25 more rows

# we take the mean of the points with same index
dat1 |>
  mutate(index = rep(1:(nrow(dat1)/5), each = 5)) |>
  group_by(index) |>
  summarize(mean_measure = mean(measure))
#> # A tibble: 7 × 2
#>   index mean_measure
#>   <int>        <dbl>
#> 1     1         3.26
#> 2     2         3.26
#> 3     3         3.43
#> 4     4         3.51
#> 5     5         3.65
#> 6     6         3.50
#> 7     7         3.35


# second approach, we use the actual value of the time, not relying on the number of points
dat1 |>
  mutate(fraction_of_seconds = lubridate::second(time) - round(lubridate::second(time)))
#> # A tibble: 35 × 4
#>    ID    time     measure fraction_of_seconds
#>    <chr> <Period>   <dbl>               <dbl>
#>  1 SS07  0S          3.32                 0  
#>  2 SS07  0.2S        3.29                 0.2
#>  3 SS07  0.4S        3.26                 0.4
#>  4 SS07  0.6S        3.24                -0.4
#>  5 SS07  0.8S        3.21                -0.2
#>  6 SS07  1S          3.19                 0  
#>  7 SS07  1.2S        3.19                 0.2
#>  8 SS07  1.4S        3.22                 0.4
#>  9 SS07  1.6S        3.30                -0.4
#> 10 SS07  1.8S        3.38                -0.2
#> # … with 25 more rows

dat1 |>
  mutate(fraction_of_seconds = lubridate::second(time) - round(lubridate::second(time))) |>
  filter(fraction_of_seconds == 0)
#> # A tibble: 7 × 4
#>   ID    time     measure fraction_of_seconds
#>   <chr> <Period>   <dbl>               <dbl>
#> 1 SS07  0S          3.32                   0
#> 2 SS07  1S          3.19                   0
#> 3 SS07  2S          3.42                   0
#> 4 SS07  3S          3.40                   0
#> 5 SS07  4S          3.66                   0
#> 6 SS07  5S          3.58                   0
#> 7 SS07  6S          3.39                   0

# Or using the second it was measured in to take the mean
dat1 |>
  mutate(seconds = floor(lubridate::second(time)))
#> # A tibble: 35 × 4
#>    ID    time     measure seconds
#>    <chr> <Period>   <dbl>   <dbl>
#>  1 SS07  0S          3.32       0
#>  2 SS07  0.2S        3.29       0
#>  3 SS07  0.4S        3.26       0
#>  4 SS07  0.6S        3.24       0
#>  5 SS07  0.8S        3.21       0
#>  6 SS07  1S          3.19       1
#>  7 SS07  1.2S        3.19       1
#>  8 SS07  1.4S        3.22       1
#>  9 SS07  1.6S        3.30       1
#> 10 SS07  1.8S        3.38       1
#> # … with 25 more rows

dat1 |>
  mutate(seconds = floor(lubridate::second(time))) |>
  group_by(seconds) |>
  summarize(mean_measure = mean(measure))
#> # A tibble: 7 × 2
#>   seconds mean_measure
#>     <dbl>        <dbl>
#> 1       0         3.26
#> 2       1         3.26
#> 3       2         3.43
#> 4       3         3.51
#> 5       4         3.65
#> 6       5         3.50
#> 7       6         3.35

^{Created on 2023-04-02 by the reprex package (v2.0.1)}