create column indicating various conditions

I have this database

# A tibble: 15 x 7
      id   age  demo result category nationality sport  
   <int> <dbl> <dbl> <chr>  <chr>    <chr>       <chr>  
 1     1    16     1 POS    A        Usa         soccer 
 2     2    19     1 POS    A        Spain       soccer 
 3     3    16     1 NEG    C        Spain       basquet
 4     4    17     1 NEG    A        Brasil      golf   
 5     5    17     1 NEG    B        Spain       voley  
 6     6    17     1 POS    C        Usa         soccer 
 7     7    17     1 POS    A        Brasil      voley  
 8     8    16     1 POS    A        Spain       soccer 
 9     9    16     1 POS    A        France      soccer 
10    10    19     1 NEG    B        Brasil      golf   
11    11    17     1 POS    A        Brasil      golf   
12    12    16     0 NEG    A        Brasil      voley  
13    13    16     0 NEG    B        France      basquet
14    14    16     0 NEG    B        Usa         basquet
15    15    20     0 NEG    A        France      basquet

and I need to create a column " exclusion" that has a value of 1 when the observations accomplish certain conditions and 0 when they do not. the conditions are: have an age under 18, a demo equal to 1, a result equal to NEG and sport equal to soccer. fulfilling one of the conditions already acquires a value of 1 in the column exclusio . The complicated thing is that I also want to create a column "reason" that indicates what condition the observation fulfills in order to have obtained a value of 1 in the created column. the final table I would like would be like this:

id age demo result category nationality sport exclusion reason
1 16 1 POS A Usa soccer 1 under 18, demo equal 1, chose soccer
2 19 1 POS A Spain soccer 1 demo equal 1, chose soccer
3 21 0 POS C Spain basquet
4 20 1 POS A Brasil golf 1 demo equal 1
5 17 0 NEG B Spain voley 1 under 18, resul negative
6 19 0 POS C Usa voley
7 17 0 POS A Brasil voley 1 under 18, chose soccer
8 16 1 POS A Spain soccer 1 under 18, demo equal 1, chose soccer
9 19 0 POS A France golf
10 19 0 POS B Brasil soccer 1 chose soccer
11 21 0 POS A Brasil golf
12 20 0 POS A Brasil voley
13 20 0 POS B France basquet
14 16 1 POS B Usa soccer 1 under 18, demo equal 1, chose soccer
15 20 0 NEG A France soccer 1 resul negative, chose soccer
1 Like

I would suggest creating a new column for each condition, containing the corresponding reason, then combining these columns, and removing the leftover. Something like:

library(tidyverse)

df <- read.table(text = "     id   age  demo result category nationality sport  
 1     1    16     1 POS    A        Usa         soccer 
 2     2    19     1 POS    A        Spain       soccer 
 3     3    16     1 NEG    C        Spain       basquet
 4     4    17     1 NEG    A        Brasil      golf   
 5     5    17     1 NEG    B        Spain       voley  
 6     6    17     1 POS    C        Usa         soccer 
 7     7    17     1 POS    A        Brasil      voley  
 8     8    16     1 POS    A        Spain       soccer 
 9     9    16     1 POS    A        France      soccer 
10    10    19     1 NEG    B        Brasil      golf   
11    11    17     1 POS    A        Brasil      golf   
12    12    16     0 NEG    A        Brasil      voley  
13    13    16     0 NEG    B        France      basquet
14    14    16     0 NEG    B        Usa         basquet
15    15    20     0 NEG    A        France      basquet",
header = TRUE) |>
  as_tibble()


mypaste <- function(s){
  s <- s[! (nchar(s) == 0 | is.na(s))]
  
  paste0(s, collapse = ", ")
}


df |>
  mutate(reas_age = if_else(age < 18, "under 18", ""),
         reas_demo = if_else(demo == 1, "demo equal 1", ""),
         reas_result = if_else(result == "NEG" & sport == "soccer", "chose soccer", "")) |>
  rowwise() |>
  mutate(exclusion = 1*(sum(nchar(c_across(starts_with("reas_")))) > 0),
         reason = if_else(exclusion == 1,
                          mypaste(c_across(starts_with("reas_"))),
                          "")) |>
  select(-starts_with("reas_")) |>
  ungroup()
#> # A tibble: 15 × 9
#>       id   age  demo result category nationality sport   exclusion reason       
#>    <int> <int> <int> <chr>  <chr>    <chr>       <chr>       <dbl> <chr>        
#>  1     1    16     1 POS    A        Usa         soccer          1 "under 18, d…
#>  2     2    19     1 POS    A        Spain       soccer          1 "demo equal …
#>  3     3    16     1 NEG    C        Spain       basquet         1 "under 18, d…
#>  4     4    17     1 NEG    A        Brasil      golf            1 "under 18, d…
#>  5     5    17     1 NEG    B        Spain       voley           1 "under 18, d…
#>  6     6    17     1 POS    C        Usa         soccer          1 "under 18, d…
#>  7     7    17     1 POS    A        Brasil      voley           1 "under 18, d…
#>  8     8    16     1 POS    A        Spain       soccer          1 "under 18, d…
#>  9     9    16     1 POS    A        France      soccer          1 "under 18, d…
#> 10    10    19     1 NEG    B        Brasil      golf            1 "demo equal …
#> 11    11    17     1 POS    A        Brasil      golf            1 "under 18, d…
#> 12    12    16     0 NEG    A        Brasil      voley           1 "under 18"   
#> 13    13    16     0 NEG    B        France      basquet         1 "under 18"   
#> 14    14    16     0 NEG    B        Usa         basquet         1 "under 18"   
#> 15    15    20     0 NEG    A        France      basquet         0 ""

Created on 2022-07-12 by the reprex package (v2.0.1)

2 Likes

This problem comes up frequently. There is a simpler solution.

No need to make multiple columns and merge them. Instead, the same column can be used for multiple ifelse conditions.

The key is to make the column you are editing the 'else' condition, that way if the condition is not met, it returns the existing value. That way, you can build ifelse conditions on top of each other and not overwrite them.

Given a data frame with 'state' and 'year,' I will make a 'government' column and add the appropriate head of government:

library(magrittr)  # To use the %>% pipe.

# Create the data for demonstration:
data <- dplyr::tibble(
  expand.grid(
    c(1999:2022), 
    c('Algeria', 'Morocco'))       
) %>% 
  dplyr::rename(year  = Var1,  state = Var2)

# Use the same column you are creating as the 'else' condition. 
# Below, I  make 'government,' then use 'government' as the else condition.:
data %>% 
  dplyr::mutate(government = ifelse(year < 2019 & state == 'Algeria', 'Bouteflika', 'NA')
  ) %>% 
  dplyr::mutate(government = ifelse(year >= 2019 & state == 'Algeria', 'Tebboune', government)
  )%>% 
  dplyr::mutate(government = ifelse(year >= 1999 & state == 'Morocco', 'Mohammed VI', government))


Good solution in general, but note that here juandmaz added a twist: keeping a column with the reason for exclusion.

Using only if_else() is convenient to decide whether to exclude a row, and it's also practical if you want to create a column with a single reason for exclusion; but to combine all the reasons into a single string that means you're starting to have to think about the correct place of commas etc. Much easier to let paste() decide for you.

So here making intermediary columns seems hard to avoid, the other approach that would still be convenient would be a list-column, so you can still call paste() on it, but that feels less intuitive.

2 Likes

Although inconsequential at this point, I'd suggest keeping the data consistent; both the "database" and the desired outcome are not aligned.

The following uses base R and data.table.

library(data.table)

# juandmaz's "database"
dt_1 <- data.table(
  id = c(1:15),
  age = c(16, 19, 16, 17, 17,
          17, 17, 16, 16, 19,
          17, 16, 16, 16, 20),
  demo = c(rep(1, 11), rep(0, 4)),
  result = c("POS", "POS", "NEG", "NEG", "NEG",
             "POS", "POS", "POS", "POS", "NEG",
             "POS", "NEG", "NEG", "NEG", "NEG"),
  category = c("A", "A", "C", "A", "B",
               "C", "A", "A", "A", "B",
               "A", "A", "B", "B", "A"),
  nationality = c("Usa", "Spain", "Spain", "Brasil", "Spain",
                  "Usa", "Brasil", "Spain", "France", "Brasil",
                  "Brasil", "Brasil", "France", "Usa", "France"),
  sport = c("soccer", "soccer", "basquet", "golf", "voley",
            "soccer", "voley", "soccer", "soccer", "golf",
            "golf", "voley", "basquet", "basquet", "basquet")
)

# juandmaz's reference data
dt_2 <- data.table(
  id = c(1:15),
  age = c(16, 19, 21, 20, 17,
          19, 17, 16, 19, 19,
          21, 20, 20, 16, 20),
  demo = c(1, 1, 0, 1, 0,
           0, 0, 1, 0, 0,
           0, 0, 0, 1, 0),
  result = c("POS", "POS", "POS", "POS", "NEG",
             "POS", "POS", "POS", "POS", "POS",
             "POS", "POS", "POS", "POS", "NEG"),
  category = c("A", "A", "C", "A", "B",
               "C", "A", "A", "A", "B",
               "A", "A", "B", "B", "A"),
  nationality = c("Usa", "Spain", "Spain", "Brasil", "Spain",
                  "Usa", "Brasil", "Spain", "France", "Brasil",
                  "Brasil", "Brasil", "France", "Usa", "France"),
  sport = c("soccer", "soccer", "basquet", "golf", "voley",
            "voley", "voley", "soccer", "golf", "soccer",
            "golf", "voley", "basquet", "soccer", "soccer")
)

# Inconsistancies in references (omitting [exclusion] and [reason])
dt_1 == dt_2

# Rules for [exclusion] and [reason]
# 1. [age] < 18
# 2. [demo] == 1
# 3. [result] == "NEG"
# 4. [sport] == "soccer"

# Seperate Columns for reference
# dt_2[age < 18, i.age := "under 18"][demo == 1, i.demo := "demo equal 1"][result == "NEG", i.result := "resul negative"][sport == "soccer", i.sport := "chose soccer"][]

# Combine and clean (getting rid of tailing space and ",")
dt_2[
  age < 18 | demo == 1 | result == "NEG" | sport == "soccer", exclusion := 1
     ][
       , reason := gsub(
         "NA, |, $|^.\\s+",
         "",
         paste0(
           ifelse(age < 18, "under 18, ", ""),
           ifelse(demo == 1, "demo equal 1, ", ""),
           ifelse(result == "NEG", "resul negative, ", ""),
           ifelse(sport == "soccer", "chose soccer, ", ""
           )
         )
       )
     ]
1 Like

Good point. The ifelse conditions would need to account for all the different combinations of exclusion.

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.