Scores by Year and month - summarise issue

Hi,
I have a simple df I would like to summarise (IntYear in the first column, IntMonth in the next, mean scores of four measures and count in the end).
I have an error and I don't know what is wrong.
Can you help please?

data.source <- data.frame(stringsAsFactors=FALSE,
                          DF.URN = c("C688230", "C688234", "C888832", "C889311", "C889312",
                                     "C889313", "C889320", "C889353", "C889359",
                                     "C989839", "V08152703", "V09159381",
                                     "V11187611", "V11187616", "V12196198", "V12196205",
                                     "V12196208", "CE90195359", "CE90195377",
                                     "CE90195408", "CE90195411", "CE90195413", "CE90195450",
                                     "CE90195455"),
                          Sat_Score = c(90, 100, 90, 80, 100, 100, 100, 90, 90, 100, 80, 70, 90,
                                        100, 90, 100, 50, 100, 100, 90, 100, 70, 100,
                                        100),
                          Rec_Score = c(90, 100, 90, 70, 100, 100, 100, 90, 90, 100, 80, 90, 90,
                                        100, 80, 100, 40, 100, 100, 90, 100, 100,
                                        100, 100),
                          Staff_Score = c(90, 100, 80, 90, 100, 90, 100, 90, 90, 100, 80, 90, 90, 100,
                                          50, 100, 60, 100, 100, 100, 100, 60, 100,
                                          100),
                          Value_Score = c(90, 100, 80, 60, 100, 90, 100, 90, 80, 100, 60, 70, 90, 100,
                                          100, 90, 80, 100, 100, 90, 100, 90, 100,
                                          100),
                          IntDate = c("2017-07-03", "2017-07-04", "2017-08-07", "2017-09-04",
                                      "2017-08-29", "2017-09-01", "2017-08-29",
                                      "2017-09-07", "2017-09-01", "2017-10-02",
                                      "2018-10-12", "2018-10-03", "2018-11-30", "2018-11-30",
                                      "2018-12-31", "2018-12-31", "2018-12-31",
                                      "2019-01-21", "2019-01-23", "2019-02-04",
                                      "2019-01-29", "2019-01-31", "2019-02-04", "2019-02-04"),
                          IntYear = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
                                      2018, 2018, 2018, 2018, 2018, 2018, 2018,
                                      2019, 2019, 2019, 2019, 2019, 2019, 2019),
                          IntMonth = c(7, 7, 8, 9, 8, 9, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12,
                                       1, 1, 2, 1, 1, 2, 2)
)
  
library(dplyr)

Y.M <- data.source %>% 
  group_by(IntYear,IntMonth) %>% 
  summarise(Rec_Score = mean(Rec_Score),
            Sat_Score = mean(Sat_Score, na.rm = TRUE),
            Staff_Score = mean(Staff_Score, na.rm = TRUE)) %>%
  mutate(Rec_Score = n())

Y.M

error:

Error: n() should only be called in a data context
1 Like

I ran your code and did not get an error. I would make sure you restart your R Session (Session > Restart R) to make sure there isn't something loaded in your environment that is messing you up. I am also using dplyr version 0.8.3. If you are looking for the count of rows per year/month you can add n() to your summarise() call. In your example code you are overwriting your mean scores with the count. I believe this is what you are looking for, let me know if this is not what you were looking for! :grin:

data.source <- data.frame(
  stringsAsFactors=FALSE,
  DF.URN = c("C688230", "C688234", "C888832", "C889311", "C889312",
             "C889313", "C889320", "C889353", "C889359",
             "C989839", "V08152703", "V09159381",
             "V11187611", "V11187616", "V12196198", "V12196205",
             "V12196208", "CE90195359", "CE90195377",
             "CE90195408", "CE90195411", "CE90195413", "CE90195450",
             "CE90195455"),
  Sat_Score = c(90, 100, 90, 80, 100, 100, 100, 90, 90, 100, 80, 70, 90,
                100, 90, 100, 50, 100, 100, 90, 100, 70, 100,
                100),
  Rec_Score = c(90, 100, 90, 70, 100, 100, 100, 90, 90, 100, 80, 90, 90,
                100, 80, 100, 40, 100, 100, 90, 100, 100,
                100, 100),
  Staff_Score = c(90, 100, 80, 90, 100, 90, 100, 90, 90, 100, 80, 90, 90, 100,
                  50, 100, 60, 100, 100, 100, 100, 60, 100,
                  100),
  Value_Score = c(90, 100, 80, 60, 100, 90, 100, 90, 80, 100, 60, 70, 90, 100,
                  100, 90, 80, 100, 100, 90, 100, 90, 100,
                  100),
  IntDate = c("2017-07-03", "2017-07-04", "2017-08-07", "2017-09-04",
              "2017-08-29", "2017-09-01", "2017-08-29",
              "2017-09-07", "2017-09-01", "2017-10-02",
              "2018-10-12", "2018-10-03", "2018-11-30", "2018-11-30",
              "2018-12-31", "2018-12-31", "2018-12-31",
              "2019-01-21", "2019-01-23", "2019-02-04",
              "2019-01-29", "2019-01-31", "2019-02-04", "2019-02-04"),
  IntYear = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
              2018, 2018, 2018, 2018, 2018, 2018, 2018,
              2019, 2019, 2019, 2019, 2019, 2019, 2019),
  IntMonth = c(7, 7, 8, 9, 8, 9, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12,
               1, 1, 2, 1, 1, 2, 2)
)

suppressPackageStartupMessages(library(dplyr))

Y.M <- data.source %>% 
  group_by(IntYear,IntMonth) %>% 
  summarise(
    Rec_Score = mean(Rec_Score),
    Sat_Score = mean(Sat_Score, na.rm = TRUE),
    Staff_Score = mean(Staff_Score, na.rm = TRUE),
    count = n()
  )

Y.M
#> # A tibble: 9 x 6
#> # Groups:   IntYear [3]
#>   IntYear IntMonth Rec_Score Sat_Score Staff_Score count
#>     <dbl>    <dbl>     <dbl>     <dbl>       <dbl> <int>
#> 1    2017        7      95        95          95       2
#> 2    2017        8      96.7      96.7        93.3     3
#> 3    2017        9      87.5      90          90       4
#> 4    2017       10     100       100         100       1
#> 5    2018       10      85        75          85       2
#> 6    2018       11      95        95          95       2
#> 7    2018       12      73.3      80          70       3
#> 8    2019        1     100        92.5        90       4
#> 9    2019        2      96.7      96.7       100       3

Created on 2019-12-19 by the reprex package (v0.3.0)

2 Likes

Thank you.

Now I am actually thinking that count of responses might differ in my real data. How can I add count after each mean score? So Rec_Score mean, Rec_Score count, Sat_Score mean, Sat_Score count etc...rather than having just one count in the end of the table?

Do you think there is a way of selecting variables containing "Score" without listing them in the summarise code? My code is not working:

Y.M <- data.source %>% 
  select(IntYear, IntMonth, ends_with("Score")) %>% 
  group_by(IntYear,IntMonth) %>% 
  summarise_all(list(mean(),
    count = n())
  )

I think I understand what you are saying. Since the given data is not independent/tidy (one row per observation), lets use {tidyr} to manipulate the data so there is one score per year/month. After that we can summarize it. Thoughts?

data_source <- data.frame(
  stringsAsFactors=FALSE,
  DF.URN = c("C688230", "C688234", "C888832", "C889311", "C889312",
             "C889313", "C889320", "C889353", "C889359",
             "C989839", "V08152703", "V09159381",
             "V11187611", "V11187616", "V12196198", "V12196205",
             "V12196208", "CE90195359", "CE90195377",
             "CE90195408", "CE90195411", "CE90195413", "CE90195450",
             "CE90195455"),
  Sat_Score = c(90, 100, 90, 80, 100, 100, 100, 90, 90, 100, 80, 70, 90,
                100, 90, 100, 50, 100, 100, 90, 100, 70, 100,
                100),
  Rec_Score = c(90, 100, 90, 70, 100, 100, 100, 90, 90, 100, 80, 90, 90,
                100, 80, 100, 40, 100, 100, 90, 100, 100,
                100, 100),
  Staff_Score = c(90, 100, 80, 90, 100, 90, 100, 90, 90, 100, 80, 90, 90, 100,
                  50, 100, 60, 100, 100, 100, 100, 60, 100,
                  100),
  Value_Score = c(90, 100, 80, 60, 100, 90, 100, 90, 80, 100, 60, 70, 90, 100,
                  100, 90, 80, 100, 100, 90, 100, 90, 100,
                  100),
  IntDate = c("2017-07-03", "2017-07-04", "2017-08-07", "2017-09-04",
              "2017-08-29", "2017-09-01", "2017-08-29",
              "2017-09-07", "2017-09-01", "2017-10-02",
              "2018-10-12", "2018-10-03", "2018-11-30", "2018-11-30",
              "2018-12-31", "2018-12-31", "2018-12-31",
              "2019-01-21", "2019-01-23", "2019-02-04",
              "2019-01-29", "2019-01-31", "2019-02-04", "2019-02-04"),
  IntYear = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
              2018, 2018, 2018, 2018, 2018, 2018, 2018,
              2019, 2019, 2019, 2019, 2019, 2019, 2019),
  IntMonth = c(7, 7, 8, 9, 8, 9, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 12,
               1, 1, 2, 1, 1, 2, 2)
)

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))

long_data <- data_source %>% 
  janitor::clean_names() %>%  ## change column names to snake_case, my preference :)
  pivot_longer(
    cols = ends_with("score"),
    names_to = "type",
    values_to = "score"
)

## Data in long format
long_data
#> # A tibble: 96 x 6
#>    df_urn  int_date   int_year int_month type        score
#>    <chr>   <chr>         <dbl>     <dbl> <chr>       <dbl>
#>  1 C688230 2017-07-03     2017         7 sat_score      90
#>  2 C688230 2017-07-03     2017         7 rec_score      90
#>  3 C688230 2017-07-03     2017         7 staff_score    90
#>  4 C688230 2017-07-03     2017         7 value_score    90
#>  5 C688234 2017-07-04     2017         7 sat_score     100
#>  6 C688234 2017-07-04     2017         7 rec_score     100
#>  7 C688234 2017-07-04     2017         7 staff_score   100
#>  8 C688234 2017-07-04     2017         7 value_score   100
#>  9 C888832 2017-08-07     2017         8 sat_score      90
#> 10 C888832 2017-08-07     2017         8 rec_score      90
#> # … with 86 more rows

summarized_data <- long_data %>% 
  group_by(int_year, int_month, type) %>% 
  summarise(
    avg_score = mean(score),
    count = n()
  )

## Summarized View
summarized_data
#> # A tibble: 36 x 5
#> # Groups:   int_year, int_month [9]
#>    int_year int_month type        avg_score count
#>       <dbl>     <dbl> <chr>           <dbl> <int>
#>  1     2017         7 rec_score        95       2
#>  2     2017         7 sat_score        95       2
#>  3     2017         7 staff_score      95       2
#>  4     2017         7 value_score      95       2
#>  5     2017         8 rec_score        96.7     3
#>  6     2017         8 sat_score        96.7     3
#>  7     2017         8 staff_score      93.3     3
#>  8     2017         8 value_score      93.3     3
#>  9     2017         9 rec_score        87.5     4
#> 10     2017         9 sat_score        90       4
#> # … with 26 more rows

Created on 2019-12-19 by the reprex package (v0.3.0)

Clever solution but it ignores means with missing variables (I used my real, bigger data set with some missing values and means for them are NAs)...

1 Like

Have you tried adding na.rm = TRUE to your code?

2 Likes
summarized_data <- long_data %>%
    group_by(int_year, int_month, type) %>%
    summarise(
        avg_score = mean(score, na.rm = TRUE),
        count = n()
    )

As @andresrcs stated, adding na.rm. = TRUE the code will adjust for the missing values. :grin:

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.