Group_by() does not work

Hi,

I don't know why the function group_by() doesn't work in this case. It doesn't group rows by column 'OBEC'.

Could you help me, please?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union



rstudio <- data.frame(
  stringsAsFactors = FALSE,
         ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),
          TYP_FORM = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
            OPRAVA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             CHYBA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             OKRES = c(2101,2101,2101,2101,2101,
                       2101,2101,2101,2101,2101),
              OBEC = c(513482,529303,529303,529303,
                       529303,529303,529303,529303,529303,529303),
            OKRSEK = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9),
              KC_1 = c(0, 5, 6, 0, 1, 2, 3, 4, 5, 6),
        VOL_SEZNAM = c(164, 841, 997, 569, 826, 693, 546, 696, 516, 676),
        VYD_OBALKY = c(81, 158, 336, 183, 272, 261, 205, 247, 195, 212),
        ODEVZ_OBAL = c(81, 158, 334, 183, 272, 261, 205, 247, 195, 210),
        PL_HL_CELK = c(81, 156, 332, 177, 271, 258, 202, 241, 192, 209),
              KC_2 = c(407,1313,1999,1112,1641,
                       1473,1158,1431,1098,1307)
)



rstudio <- rstudio %>% 
  dplyr::group_by(OBEC) %>% 
  dplyr::summarise(ucast = PL_HL_CELK/VOL_SEZNAM)
#> `summarise()` regrouping output by 'OBEC' (override with `.groups` argument)

packageVersion("dplyr")
#> [1] '1.0.1'
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Catalina 10.15.6
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] crayon_1.3.4     digest_0.6.25    R6_2.4.1         lifecycle_0.2.0 
#>  [5] magrittr_1.5     evaluate_0.14    pillar_1.4.6     highr_0.8       
#>  [9] rlang_0.4.7      stringi_1.4.6    ellipsis_0.3.1   vctrs_0.3.2     
#> [13] generics_0.0.2   rmarkdown_2.3    tools_4.0.2      stringr_1.4.0   
#> [17] glue_1.4.1       purrr_0.3.4      xfun_0.17        yaml_2.2.1      
#> [21] compiler_4.0.2   pkgconfig_2.0.3  htmltools_0.5.0  tidyselect_1.1.0
#> [25] knitr_1.29       tibble_3.0.3

Thank you!

All you need to do is assign the output of the dplyr function to the rstudio variable.

library(dplyr, warn.conflicts = FALSE)
rstudio <- data.frame(
  stringsAsFactors = FALSE,
  ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),
  TYP_FORM = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
  OPRAVA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  CHYBA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  OKRES = c(2101,2101,2101,2101,2101,
            2101,2101,2101,2101,2101),
  OBEC = c(513482,529303,529303,529303,
           529303,529303,529303,529303,529303,529303),
  OKRSEK = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9),
  KC_1 = c(0, 5, 6, 0, 1, 2, 3, 4, 5, 6),
  VOL_SEZNAM = c(164, 841, 997, 569, 826, 693, 546, 696, 516, 676),
  VYD_OBALKY = c(81, 158, 336, 183, 272, 261, 205, 247, 195, 212),
  ODEVZ_OBAL = c(81, 158, 334, 183, 272, 261, 205, 247, 195, 210),
  PL_HL_CELK = c(81, 156, 332, 177, 271, 258, 202, 241, 192, 209),
  KC_2 = c(407,1313,1999,1112,1641,
           1473,1158,1431,1098,1307)
)

rstudio <- rstudio %>% 
  dplyr::group_by(OBEC) %>% 
  dplyr::summarise(ucast = PL_HL_CELK/VOL_SEZNAM)
#> `summarise()` regrouping output by 'OBEC' (override with `.groups` argument)
print(rstudio)
#> # A tibble: 10 x 2
#> # Groups:   OBEC [2]
#>      OBEC ucast
#>     <dbl> <dbl>
#>  1 513482 0.494
#>  2 529303 0.185
#>  3 529303 0.333
#>  4 529303 0.311
#>  5 529303 0.328
#>  6 529303 0.372
#>  7 529303 0.370
#>  8 529303 0.346
#>  9 529303 0.372
#> 10 529303 0.309

Created on 2020-09-20 by the reprex package (v0.3.0)

I'm sorry, I don't know what you mean.

In your code you run the data.frame function but you do not assign the output to a variable. The beginning of the command looks like this

data.frame(
  stringsAsFactors = FALSE,
         ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),

In my code, the output of data.frame is assigned to the variable rstudio. The beginning of the command looks like this

rstudio <- data.frame(
  stringsAsFactors = FALSE,
  ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),

The following command works in my code because rstudio now contains the data frame.

rstudio <- rstudio %>% 
  dplyr::group_by(OBEC) %>% 
  dplyr::summarise(ucast = PL_HL_CELK/VOL_SEZNAM)

That was just a mistake in creating a reproducible example. It still doesn't work.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df_paste(rstudio)
#> Error in df_paste(rstudio): could not find function "df_paste"

rstudio <- data.frame(
  stringsAsFactors = FALSE,
         ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),
          TYP_FORM = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
            OPRAVA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             CHYBA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             OKRES = c(2101,2101,2101,2101,2101,
                       2101,2101,2101,2101,2101),
              OBEC = c(513482,529303,529303,529303,
                       529303,529303,529303,529303,529303,529303),
            OKRSEK = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9),
              KC_1 = c(0, 5, 6, 0, 1, 2, 3, 4, 5, 6),
        VOL_SEZNAM = c(164, 841, 997, 569, 826, 693, 546, 696, 516, 676),
        VYD_OBALKY = c(81, 158, 336, 183, 272, 261, 205, 247, 195, 212),
        ODEVZ_OBAL = c(81, 158, 334, 183, 272, 261, 205, 247, 195, 210),
        PL_HL_CELK = c(81, 156, 332, 177, 271, 258, 202, 241, 192, 209),
              KC_2 = c(407,1313,1999,1112,1641,
                       1473,1158,1431,1098,1307)
)



rstudio <- rstudio %>% 
  dplyr::group_by(OBEC) %>% 
  dplyr::summarise(ucast = PL_HL_CELK/VOL_SEZNAM)
#> `summarise()` regrouping output by 'OBEC' (override with `.groups` argument)

packageVersion("dplyr")
#> [1] '1.0.1'
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Catalina 10.15.6
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] crayon_1.3.4     digest_0.6.25    R6_2.4.1         lifecycle_0.2.0 
#>  [5] magrittr_1.5     evaluate_0.14    pillar_1.4.6     highr_0.8       
#>  [9] rlang_0.4.7      stringi_1.4.6    ellipsis_0.3.1   vctrs_0.3.2     
#> [13] generics_0.0.2   rmarkdown_2.3    tools_4.0.2      stringr_1.4.0   
#> [17] glue_1.4.1       purrr_0.3.4      xfun_0.17        yaml_2.2.1      
#> [21] compiler_4.0.2   pkgconfig_2.0.3  htmltools_0.5.0  tidyselect_1.1.0
#> [25] knitr_1.29       tibble_3.0.3

I think it's best to step back from the code for a moment and talk about the context: the starting data, the computation (if you were computing it by hand, what process would you follow?), and the desired result.

`group_by()` is primarily intended to be a counterpart to summarising functions, i.e. functions that collapse several rows into a single row representing the group. This requires a function of an aggregating type.
Neither `=` (which here only names the output column) nor division (`/`) is aggregating: division is elementwise and returns one value per input row.

The above snippet from your code suggests that the summarize worked. Have you inspected the rstudio object that results from that?

Hi,

I want rows with the same value in 'OBEC' to collapse and then divide the column 'PL_HL_CELK' by 'VOL_SEZNAM' and get two rows with columns 'OBEC' and 'ucast'.

Thank you

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union


rstudio <- data.frame(
  stringsAsFactors = FALSE,
         ID_OKRSKY = c(596, 767, 768, 769, 770, 771, 772, 773, 774, 775),
          TYP_FORM = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
            OPRAVA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             CHYBA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
             OKRES = c(2101,2101,2101,2101,2101,
                       2101,2101,2101,2101,2101),
              OBEC = c(513482,529303,529303,529303,
                       529303,529303,529303,529303,529303,529303),
            OKRSEK = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9),
              KC_1 = c(0, 5, 6, 0, 1, 2, 3, 4, 5, 6),
        VOL_SEZNAM = c(164, 841, 997, 569, 826, 693, 546, 696, 516, 676),
        VYD_OBALKY = c(81, 158, 336, 183, 272, 261, 205, 247, 195, 212),
        ODEVZ_OBAL = c(81, 158, 334, 183, 272, 261, 205, 247, 195, 210),
        PL_HL_CELK = c(81, 156, 332, 177, 271, 258, 202, 241, 192, 209),
              KC_2 = c(407,1313,1999,1112,1641,
                       1473,1158,1431,1098,1307)
)



rstudio %>% 
  dplyr::group_by(OBEC) %>% 
  dplyr::summarise(ucast = PL_HL_CELK/VOL_SEZNAM)
#> `summarise()` regrouping output by 'OBEC' (override with `.groups` argument)
#> # A tibble: 10 x 2
#> # Groups:   OBEC [2]
#>      OBEC ucast
#>     <dbl> <dbl>
#>  1 513482 0.494
#>  2 529303 0.185
#>  3 529303 0.333
#>  4 529303 0.311
#>  5 529303 0.328
#>  6 529303 0.372
#>  7 529303 0.370
#>  8 529303 0.346
#>  9 529303 0.372
#> 10 529303 0.309

Created on 2020-09-21 by the reprex package (v0.3.0)

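This is impossible without deciding on a method of aggregation.
The right choice is context-dependent; a common or natural one might be the mean (average) of the per-row ratios. Another is to collapse each column first and then divide, e.g. `sum(PL_HL_CELK) / sum(VOL_SEZNAM)`.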

# Toy data to illustrate the point: two rows for "John", one for "Jane".
exdf <- tibble(
  person = c("John","John","Jane"),
  height = c(1.7,1.7,1.6),
  weight = c(160,165,130)
)

# No aggregation: `height/weight` is an elementwise division, so it produces
# one value per input row and the groups are NOT collapsed. This mirrors the
# 10-row result in the question (observed there with dplyr 1.0.1).
exdf %>% group_by(person) %>%
  summarise(height_weight_ration = height/weight)

# With aggregation: mean() reduces each group's ratios to a single value,
# so the result has one row per person.
exdf %>% group_by(person) %>%
  summarise(height_weight_ration = mean(height/weight))
1 Like

Thank you! I understand now.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.