Create new data frame with first instance for each country

Hi I have a data frame that contains covid infection and mobility rates. I want to create a new data frame with the first instance for each country where mobility (frequency) falls below 70%

I'm sure this is quite easy but a little stumped at the moment.
Thanks for the help

is mobility(frequency) a percentage, so that 106.1033 represents 106% or do you mean 70% of some other variable?

also, its much more useful to transfer your example frame with dput(head(mydf,n=10)) , as then we have direct access to it.

Hi NoodleJoe,

I agree with nirgrahamuk, a dput in a reprex would help us a lot.

Using the artificial example below, would this piped approach work for you?

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

data <- tribble(
  ~country, ~date, ~frequency,
  "Australia", "2020-01-22", 107.1,
  "Australia", '2020-01-23', 69.2,
  "Australia", '2020-01-24', 66.1,
  "Australia", '2020-01-25', 59.1,
  "Belgium", '2020-01-22', 107.1,
  "Belgium", '2020-01-23', 103.9,
  "Belgium", '2020-01-24', 69.1,
  "Belgium", '2020-01-25', 60.1,
  "Brazil", '2020-01-22', 111.1,
  "Brazil", '2020-01-23', 99.8,
  "Brazil", '2020-01-24', 67.8,
  "Brazil", '2020-01-24', 63.8
)


data %>% 
  mutate(date = ymd(date)) %>% 
  filter(frequency<70) %>% 
  group_by(country) %>% 
  arrange(country, date) %>% 
  top_n(1)
#> Selecting by frequency
#> # A tibble: 3 x 3
#> # Groups:   country [3]
#>   country   date       frequency
#>   <chr>     <date>         <dbl>
#> 1 Australia 2020-01-23      69.2
#> 2 Belgium   2020-01-24      69.1
#> 3 Brazil    2020-01-24      67.8

Created on 2020-05-26 by the reprex package (v0.3.0)

Hi, yes sorry I'm not sure how to upload my data frame, can you explain this to make it easier in the future

does your dataframe have a name ?

Thank's Higgins. It didn't quite work. The values returned don't seem to be the first instance. I'm sure if I could upload my df it would be a breeze.

Yes its CovidJoin1. Thanks for your advice so far

in the console, write the following

dput( head( CovidJoin1, n = 10 ) )

the text that gets printed should be pasted into your message.
before you paste it, make an empty row and place 3 backtick symbols in it, to format what you will paste as code.

#  ```
structure(list(Country.Region = c("Australia", "Belgium", "Brazil", 
"Canada", "Denmark", "Estonia", "Finland", "France", "Germany", 
"Ireland"), Dates = structure(c(18283, 18283, 18283, 18283, 18283, 
18283, 18283, 18283, 18283, 18283), class = "Date"), Confirmed = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), Dead = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), Recovered = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Frequency = c(106.103333333333, 
114.663333333333, 102.103333333333, 106.91, 105.13, 105.016666666667, 
101.226666666667, 96.9466666666667, 101.236666666667, 118.25), 
    CurrentConfirmed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(9L, 
17L, 24L, 33L, 47L, 57L, 61L, 62L, 66L, 83L), class = "data.frame")

Here is a bigger sample

structure(list(Country.Region = c("Australia", "Belgium", "Brazil", 
"Canada", "Denmark", "Estonia", "Finland", "France", "Germany", 
"Ireland", "Italy", "Japan", "Luxembourg", "Mexico", "Netherlands", 
"New Zealand", "Norway", "Philippines", "Singapore", "Slovakia", 
"Spain", "Sweden", "Switzerland", "Australia", "Belgium", "Brazil", 
"Canada", "Denmark", "Estonia", "Finland", "France", "Germany", 
"Ireland", "Italy", "Japan", "Luxembourg", "Mexico", "Netherlands", 
"New Zealand", "Norway"), Dates = structure(c(18283, 18283, 18283, 
18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 
18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 
18283, 18283, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 
18284, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 
18284), class = "Date"), Confirmed = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0), Dead = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Recovered = c(0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Frequency = c(106.103333333333, 
114.663333333333, 102.103333333333, 106.91, 105.13, 105.016666666667, 
101.226666666667, 96.9466666666667, 101.236666666667, 118.25, 
103.97, 99.3833333333333, 99.9133333333333, 99.5633333333333, 
104.446666666667, 106.706666666667, 107.436666666667, 107.83, 
99.3966666666667, 101.626666666667, 104.68, 110, 112.623333333333, 
111.563333333333, 112.77, 98.9666666666667, 109.97, 105.696666666667, 
110.033333333333, 102.693333333333, 97.6133333333333, 103.426666666667, 
120.69, 106.393333333333, 105.89, 100.43, 103.273333333333, 107.513333333333, 
114.676666666667, 108.356666666667), CurrentConfirmed = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0)), row.names = c(9L, 
17L, 24L, 33L, 47L, 57L, 61L, 62L, 66L, 83L, 85L, 87L, 102L, 
111L, 121L, 122L, 127L, 134L, 151L, 152L, 157L, 161L, 162L, 194L, 
202L, 209L, 218L, 232L, 242L, 246L, 247L, 251L, 268L, 270L, 272L, 
287L, 296L, 306L, 307L, 312L), class = "data.frame")

library(tidyverse)
CovidJoin1 %>% filter(Frequency<=70)  %>%
group_by(Country.Region) %>% mutate(rn=row_number(Dates))  %>%
  filter(rn==1)
1 Like

You are a bloody national treasure, Thank you so much..

Note that this does not quite work with this particular subset, as there are no frequencies <= 70.

But if you set your filter to 105, it works fine.
Slightly shorter version here:

library(tidyverse)

CovidJoin1 <- structure(list(Country.Region = c("Australia", "Belgium", "Brazil", 
                                  "Canada", "Denmark", "Estonia", "Finland", "France", "Germany", 
                                  "Ireland", "Italy", "Japan", "Luxembourg", "Mexico", "Netherlands", 
                                  "New Zealand", "Norway", "Philippines", "Singapore", "Slovakia", 
                                  "Spain", "Sweden", "Switzerland", "Australia", "Belgium", "Brazil", 
                                  "Canada", "Denmark", "Estonia", "Finland", "France", "Germany", 
                                  "Ireland", "Italy", "Japan", "Luxembourg", "Mexico", "Netherlands", 
                                  "New Zealand", "Norway"), Dates = structure(c(18283, 18283, 18283, 
                                                                                18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 
                                                                                18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 
                                                                                18283, 18283, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 
                                                                                18284, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 18284, 
                                                                                18284), class = "Date"), Confirmed = c(0, 0, 0, 0, 0, 0, 0, 0, 
                                                                                                                       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                                                                                                       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0), Dead = c(0, 0, 0, 0, 0, 0, 
                                                                                                                                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                                                                                                                                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Recovered = c(0, 0, 0, 
                                                                                                                                                                                                                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                                                                                                                                                                                                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Frequency = c(106.103333333333, 
                                                                                                                                                                                                                                                                                       114.663333333333, 102.103333333333, 106.91, 105.13, 105.016666666667, 
                                                                                                                                                                                                                                                                                       101.226666666667, 96.9466666666667, 101.236666666667, 118.25, 
                                                                                                                                                                                                                                                                                       103.97, 99.3833333333333, 99.9133333333333, 99.5633333333333, 
                                                                                                                                                                                                                                                                                       104.446666666667, 106.706666666667, 107.436666666667, 107.83, 
                                                                                                                                                                                                                                                                                       99.3966666666667, 101.626666666667, 104.68, 110, 112.623333333333, 
                                                                                                                                                                                                                                                                                       111.563333333333, 112.77, 98.9666666666667, 109.97, 105.696666666667, 
                                                                                                                                                                                                                                                                                       110.033333333333, 102.693333333333, 97.6133333333333, 103.426666666667, 
                                                                                                                                                                                                                                                                                       120.69, 106.393333333333, 105.89, 100.43, 103.273333333333, 107.513333333333, 
                                                                                                                                                                                                                                                                                       114.676666666667, 108.356666666667), CurrentConfirmed = c(0, 
                                                                                                                                                                                                                                                                                                                                                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                                                                                                                                                                                                                                                                                                                                                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0)), row.names = c(9L, 
                                                                                                                                                                                                                                                                                                                                                                                                                       17L, 24L, 33L, 47L, 57L, 61L, 62L, 66L, 83L, 85L, 87L, 102L, 
                                                                                                                                                                                                                                                                                                                                                                                                                       111L, 121L, 122L, 127L, 134L, 151L, 152L, 157L, 161L, 162L, 194L, 
                                                                                                                                                                                                                                                                                                                                                                                                                       202L, 209L, 218L, 232L, 242L, 246L, 247L, 251L, 268L, 270L, 272L, 
                                                                                                                                                                                                                                                                                                                                                                                                                       287L, 296L, 306L, 307L, 312L), class = "data.frame")


CovidJoin1 %>% 
  filter(Frequency<=105)  %>%
  group_by(Country.Region, Dates) %>% 
  top_n(1)
#> Selecting by CurrentConfirmed
#> # A tibble: 18 x 7
#> # Groups:   Country.Region, Dates [18]
#>    Country.Region Dates      Confirmed  Dead Recovered Frequency
#>    <chr>          <date>         <dbl> <dbl>     <dbl>     <dbl>
#>  1 Brazil         2020-01-22         0     0         0     102. 
#>  2 Finland        2020-01-22         0     0         0     101. 
#>  3 France         2020-01-22         0     0         0      96.9
#>  4 Germany        2020-01-22         0     0         0     101. 
#>  5 Italy          2020-01-22         0     0         0     104. 
#>  6 Japan          2020-01-22         2     0         0      99.4
#>  7 Luxembourg     2020-01-22         0     0         0      99.9
#>  8 Mexico         2020-01-22         0     0         0      99.6
#>  9 Netherlands    2020-01-22         0     0         0     104. 
#> 10 Singapore      2020-01-22         0     0         0      99.4
#> 11 Slovakia       2020-01-22         0     0         0     102. 
#> 12 Spain          2020-01-22         0     0         0     105. 
#> 13 Brazil         2020-01-23         0     0         0      99.0
#> 14 Finland        2020-01-23         0     0         0     103. 
#> 15 France         2020-01-23         0     0         0      97.6
#> 16 Germany        2020-01-23         0     0         0     103. 
#> 17 Luxembourg     2020-01-23         0     0         0     100. 
#> 18 Mexico         2020-01-23         0     0         0     103. 
#> # … with 1 more variable: CurrentConfirmed <dbl>

Created on 2020-05-26 by the reprex package (v0.3.0)

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.