Filter between two columns values fails

I want to extract the months (column Month) that fall between the start season month (growstart) and the end season month (growend) for each row.

Example from initiale dataset for country Namibia:

namibia.initial = structure(list(gid = c("100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468", "100468"), xcoord = c(13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75, 13.75), ycoord = c(-20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25, -20.25), col = c("388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388", "388"), row = c("140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140", "140"), gwno = c(565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L), country = c("Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia"), km2 = c(3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089, 3282.64642089), Time = structure(c(3667, 3697, 3727, 3758, 3788, 3819, 3849, 3880, 3911, 3941, 3972, 4002, 4033, 4063, 4092, 4123, 4153, 4184, 4214, 4245), class = "Date"), SPEI1 = c(-1.95947802066803, 0.557283878326416, 1.77989518642426, -1.2029390335083, -0.119278997182846, 1.44610369205475, -1.4578732252121, -1.14002466201782, 1.1647777557373, -1.34318947792053, -0.500527501106262, 1.50793671607971, -1.45792877674103, -2.00679230690002, -1.51340460777283, -1.9636687040329, -1.40127754211426, -0.182968750596046, 0.295145452022552, 0.630711793899536), growstart = c(12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), growend = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4), maincrop = c(52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52), Month = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8), Year = c("1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1981", "1981", "1981", "1981", "1981", "1981", "1981", "1981" )), row.names = c(NA, 20L), class = "data.frame")

I applied the following code:

df = namibia.initial %>% filter( Month>= growstart, Month <= growend)

Results:

namibia.final= structure(list(gid = c("101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187", "101187"), xcoord = c(13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25, 13.25), ycoord = c(-19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75, -19.75), col = c("387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387", "387"), row = c("141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141", "141"), gwno = c(565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L, 565L), country = c("Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia", "Namibia"), km2 = c(3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733, 3272.11329733), Time = structure(c(3667, 3697, 3727, 3758, 3788, 3819, 3849, 3880, 3911, 3941, 3972, 4002, 4033, 4063, 4092, 4123, 4153, 4184, 4214, 4245), class = "Date"), SPEI1 = c(-1.95203363895416, 0.478995203971863, 1.84579658508301, -1.21629953384399, 0.0972747579216957, 2.57254576683044, -1.5848982334137, -1.53128707408905, 1.50004577636719, -1.36633145809174, -0.616199374198914, 1.59283101558685, -1.81799459457397, -2.42788457870483, -1.62756311893463, -1.85774540901184, 0.0972747579216957, -1.97819948196411, 0.301123291254044, 0.321783363819122), growstart = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), growend = c(12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), maincrop = c(25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25), Month = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8), Year = c("1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1980", "1981", "1981", "1981", "1981", "1981", "1981", "1981", "1981")), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list( gid = c("101187", "101187"), maincrop = c(25, 25), Year = c("1980", "1981"), .rows = structure(list(1:12, 13:20), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame" ), row.names = c(NA, -2L), .drop = TRUE))

My problem is that when the start season month is higher than the end season month, R does not select the corresponding rows in the final results. For example, main crop 52 does not exist in Namibia.final because the start growing month is 12 and the end growing month is 4.

How can I deal with this issue?

Thank you very much !

Maybe this works for you (assuming you use package dplyr ) :

library(dplyr)
library(magrittr)

namibia <- namibia.initial %>%
  mutate(gs=growstart,ge=growend +12*(growstart>growend)) %>%
  filter( (Month>= gs & Month <= ge) | (Month+12>= gs & Month+12 <= ge))
1 Like

Thank you very much !

I forgot to mention (because of the size I could not past all my dataset) that Namibia it's a very small sample. I have more than 300000 observations with 30 different crops.

So your solution could do the trick if it was the same growing start/end months for each crop.

I found a solution:

df = spei1.crop %>%
group_by(gid, maincrop, Year) %>%
filter( Month== growstart |
Month <= growend)

Hello @Mtrs ,

I don't see how your solution will help you.
Did you try it with a small example dataset ?
Just as I do it here:

suppressPackageStartupMessages(
  suppressWarnings(
    {
      library(dplyr)
      library(magrittr)
    }
  )
)
 
namibia.stylized <-   data.frame(
  crop = c(rep(1,12),rep(2,12)),
  growstart = c(rep(11,12),rep(12,12)),
  growend = c(rep(1,12),rep(2,12)),
  Month = c(1:12,1:12)
)

namibia <- namibia.stylized %>%
  mutate(gs=growstart,ge=growend +12*(growstart>growend)) %>%
  filter( (Month>= gs & Month <= ge) | (Month+12>= gs & Month+12 <= ge))

Created on 2022-06-27 by the reprex package (v2.0.1)

Thanks. I got too keen and thought it worked but you are right.

However, your code does not work as well because we should take into account the year.

In Namibia for year 1981, if the starting season is 12, it mean December from 1981 from April 1982.

I will repost a post that is more clear.

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.