Sum a column then divide by the total number in another column

dplyr
#1

I would like to calculate the average time for all host by the event Type.

My code is:

df %>%
  group_by(hostname) %>%
  total_time=sum(time_task) %>%
  event_type_Av= total_time/count(hostname)
data.frame(stringsAsFactors=FALSE,
    timestamp = c("2018-11-08 07:42:11", "2018-11-08 07:42:55",
                  "2018-11-08 07:43:41", "2018-11-08 07:44:07",
                  "2018-11-08 07:44:57", "2018-11-08 07:45:44", "2018-11-08 07:46:28",
                  "2018-11-08 07:47:20", "2018-11-08 07:47:56", "2018-11-08 07:48:48"),
     hostname = c("host1",
                  "host2",
                  "host2",
                  "host3", 
                  "host2",
                  "host5",
                  "host5", 
                  "host3",
                  "host3",
                  "host1"),
   event_name = c("save", "upload", "render", "upload",
                  "save", "save", "render", "upload",
                  "upload", "render"),
   event_type = c("STOP", "STOP", "STOP", "STOP", "STOP", "STOP", "STOP",
                  "STOP", "STOP", "STOP"),
    time_task = c("25.8089997768402", "40.319000005722", "42.9910001754761",
                  "24.6840000152588", "46.1050000190735", "44.2489998340607",
                  "41.2440001964569", "49.4800000190735", "33.7000000476837",
                  "49.0550000667572"),
      task_id = c("00390eee-c26c-41da-a02d-556bb7fcac67",
                  "dbc599f6-694b-46c4-a864-e09ab881af37",
                  "0ad8d29d-d30c-48c9-bd0a-fbea985464b2", "52881801-4d75-4ada-a118-682aa1d5ddf9",
                  "5c14d761-26af-4602-a51d-6378a4ad7c24",
                  "fa8d5709-ffb6-4a8b-bd73-0076c1654d49", "0ebfe158-0c86-4cde-8742-20d13cc4076b",
                  "403c1ca4-f5d3-4831-8a66-0f8be10f5aeb",
                  "ffd69831-0ba4-457b-b8a8-e37c49779d94", "70a9ab55-b17f-4df6-82ef-146425d7bbfa"),
0 Likes

#2

You'd want to leverage mutate() and the n() function that will count number of obs in group. Here is some code that will work:

library(tidyverse)

df <- data.frame(stringsAsFactors=FALSE,
           timestamp = c("2018-11-08 07:42:11", "2018-11-08 07:42:55",
                         "2018-11-08 07:43:41", "2018-11-08 07:44:07",
                         "2018-11-08 07:44:57", "2018-11-08 07:45:44", "2018-11-08 07:46:28",
                         "2018-11-08 07:47:20", "2018-11-08 07:47:56", "2018-11-08 07:48:48"),
           hostname = c("host1",
                        "host2",
                        "host2",
                        "host3", 
                        "host2",
                        "host5",
                        "host5", 
                        "host3",
                        "host3",
                        "host1"),
           event_name = c("save", "upload", "render", "upload",
                          "save", "save", "render", "upload",
                          "upload", "render"),
           event_type = c("STOP", "STOP", "STOP", "STOP", "STOP", "STOP", "STOP",
                          "STOP", "STOP", "STOP"),
           time_task = c("25.8089997768402", "40.319000005722", "42.9910001754761",
                         "24.6840000152588", "46.1050000190735", "44.2489998340607",
                         "41.2440001964569", "49.4800000190735", "33.7000000476837",
                         "49.0550000667572"),
           task_id = c("00390eee-c26c-41da-a02d-556bb7fcac67",
                       "dbc599f6-694b-46c4-a864-e09ab881af37",
                       "0ad8d29d-d30c-48c9-bd0a-fbea985464b2", "52881801-4d75-4ada-a118-682aa1d5ddf9",
                       "5c14d761-26af-4602-a51d-6378a4ad7c24",
                       "fa8d5709-ffb6-4a8b-bd73-0076c1654d49", "0ebfe158-0c86-4cde-8742-20d13cc4076b",
                       "403c1ca4-f5d3-4831-8a66-0f8be10f5aeb",
                       "ffd69831-0ba4-457b-b8a8-e37c49779d94", "70a9ab55-b17f-4df6-82ef-146425d7bbfa")
)

df %>%
  group_by(hostname) %>%
  mutate(total_time = sum(as.numeric(time_task)),
         event_type_Av = total_time/n())
#> # A tibble: 10 x 8
#> # Groups:   hostname [4]
#>    timestamp hostname event_name event_type time_task task_id total_time
#>    <chr>     <chr>    <chr>      <chr>      <chr>     <chr>        <dbl>
#>  1 2018-11-… host1    save       STOP       25.80899… 00390e…       74.9
#>  2 2018-11-… host2    upload     STOP       40.31900… dbc599…      129. 
#>  3 2018-11-… host2    render     STOP       42.99100… 0ad8d2…      129. 
#>  4 2018-11-… host3    upload     STOP       24.68400… 528818…      108. 
#>  5 2018-11-… host2    save       STOP       46.10500… 5c14d7…      129. 
#>  6 2018-11-… host5    save       STOP       44.24899… fa8d57…       85.5
#>  7 2018-11-… host5    render     STOP       41.24400… 0ebfe1…       85.5
#>  8 2018-11-… host3    upload     STOP       49.48000… 403c1c…      108. 
#>  9 2018-11-… host3    upload     STOP       33.70000… ffd698…      108. 
#> 10 2018-11-… host1    render     STOP       49.05500… 70a9ab…       74.9
#> # … with 1 more variable: event_type_Av <dbl>

Created on 2019-01-15 by the reprex package (v0.2.1)

1 Like

#3

If you are only interested in the results, try something like this.

library(dplyr)
data <- data.frame(stringsAsFactors=FALSE,
                   timestamp = c("2018-11-08 07:42:11", "2018-11-08 07:42:55",
                                 "2018-11-08 07:43:41", "2018-11-08 07:44:07",
                                 "2018-11-08 07:44:57", "2018-11-08 07:45:44", "2018-11-08 07:46:28",
                                 "2018-11-08 07:47:20", "2018-11-08 07:47:56", "2018-11-08 07:48:48"),
                   hostname = c("host1",
                                "host2",
                                "host2",
                                "host3", 
                                "host2",
                                "host5",
                                "host5", 
                                "host3",
                                "host3",
                                "host1"),
                   event_name = c("save", "upload", "render", "upload",
                                  "save", "save", "render", "upload",
                                  "upload", "render"),
                   event_type = c("STOP", "STOP", "STOP", "STOP", "STOP", "STOP", "STOP",
                                  "STOP", "STOP", "STOP"),
                   time_task = c(25.8089997768402, 40.319000005722, 42.9910001754761,
                                 24.6840000152588, 46.1050000190735, 44.2489998340607,
                                 41.2440001964569, 49.4800000190735, 33.7000000476837,
                                 49.0550000667572)
)
data %>% 
    group_by(event_name) %>% 
    summarise(mean_time = mean(time_task))
#> # A tibble: 3 x 2
#>   event_name mean_time
#>   <chr>          <dbl>
#> 1 render          44.4
#> 2 save            38.7
#> 3 upload          37.0

data %>% 
    group_by(hostname, event_name) %>% 
    summarise(mean_time = mean(time_task))
#> # A tibble: 8 x 3
#> # Groups:   hostname [?]
#>   hostname event_name mean_time
#>   <chr>    <chr>          <dbl>
#> 1 host1    render          49.1
#> 2 host1    save            25.8
#> 3 host2    render          43.0
#> 4 host2    save            46.1
#> 5 host2    upload          40.3
#> 6 host3    upload          36.0
#> 7 host5    render          41.2
#> 8 host5    save            44.2
1 Like

#4

Many Thanks fo this! Is there away to filter on the highest and the lowest mean from thw column mean_time? Thanks

0 Likes

#5

Like this?

data %>% 
    group_by(hostname, event_name) %>% 
    summarise(mean_time = mean(time_task)) %>%
    ungroup() %>% 
    filter(mean_time == max(mean_time) | mean_time == min(mean_time))
#> # A tibble: 2 x 3
#>   hostname event_name mean_time
#>   <chr>    <chr>          <dbl>
#> 1 host1    render          49.1
#> 2 host1    save            25.8

or lyke this?

data %>% 
    group_by(event_name, hostname) %>% 
    summarise(mean_time = mean(time_task)) %>%
    group_by(event_name) %>% 
    filter(mean_time == max(mean_time) | mean_time == min(mean_time)) %>% 
    arrange(event_name, hostname)
#> # A tibble: 6 x 3
#> # Groups:   event_name [3]
#>   event_name hostname mean_time
#>   <chr>      <chr>        <dbl>
#> 1 render     host1         49.1
#> 2 render     host5         41.2
#> 3 save       host1         25.8
#> 4 save       host2         46.1
#> 5 upload     host2         40.3
#> 6 upload     host3         36.0
1 Like

#6

That is exactly what I am after. Thanks very much.

0 Likes

closed #7

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

0 Likes