combine two factor variables to create new strata

Hi,
I am trying to combine two categorical variables to create a third variable (strata). The first is region (north, central, south); the second is MTYPE, the type of dwelling (1 = rural, 2 = urban). I wish to combine these so that I have a six-level variable (North1, North2, Central1, Central2, etc., or Northrural, Northurban, etc.).

A snippet of my data:

# Six example rows; `region` and `MTYPE` are the two variables to combine,
# and `strata` (all NA) is the column that should hold the result.
data.frame(
  Zn_gdl = c(59.375, 56.25, 50, 53.125, 93.75, 68.75),
  znadj = c(
    59.375, 59.05237961148, 50,
    55.7716918552867, 99.3951300710933, 68.75
  ),
  status = c("low", "low", "low", "low", "ok", "ok"),
  CRP = c(4.97, 19.23, 1.19, 13.29, 5.03, 0.84),
  AGP = c(0.65, 1.56, 0.61, 1.07, 0.8, 0.44),
  inflst = c("A_norm", "earl", "A_norm", "earl", "incu", "A_norm"),
  time_blood_draw = c(1, 1, 1, 2, 1, 2),
  fast = c(0, 0, 0, 0, 0, 0),
  gender = c("M", "M", "M", "M", "M", "M"),
  group = c("men", "men", "men", "men", "men", "men"),
  MCLUSTER = c(601, 601, 601, 511, 511, 829),
  MNUMBER = c(289, 151, 9, 94, 95, 234),
  M01 = c(2, 2, 1, 2, 1, 1),
  unadjstatus = c("low", "low", "low", "low", "ok", "ok"),
  wgt = c(0.995193, 0.995193, 0.995193, 1.117896, 1.117896, 1.477476),
  MREGION = c(2, 2, 3, 2, 1, 3),
  region = c("central", "central", "south", "central", "north", "south"),
  strata = c(NA, NA, NA, NA, NA, NA),
  MTYPE = factor(c("2", "1", "1", "2", "2", "2")),
  stringsAsFactors = FALSE
)

I have spent a few hours trying different functions such as combine.levels and mutate but keep going in circles and not getting what I want. Can anyone help?

I'm a little confused by the use of MTYPE — wouldn't any instances of *1 be equivalent to *rural and *2 to *urban — so if you're adding a new variable, every instance of, say North1, would also be Northrural?

OK, that aside, here are a few options:

suppressPackageStartupMessages(library(tidyverse))
# Rebuild the asker's six-row example. Character columns stay character
# (stringsAsFactors = FALSE); only MTYPE is a factor, with levels "1" and "2".
df <- data.frame(
  Zn_gdl = c(59.375, 56.25, 50, 53.125, 93.75, 68.75),
  znadj = c(
    59.375, 59.05237961148, 50,
    55.7716918552867, 99.3951300710933, 68.75
  ),
  status = c("low", "low", "low", "low", "ok", "ok"),
  CRP = c(4.97, 19.23, 1.19, 13.29, 5.03, 0.84),
  AGP = c(0.65, 1.56, 0.61, 1.07, 0.8, 0.44),
  inflst = c("A_norm", "earl", "A_norm", "earl", "incu", "A_norm"),
  time_blood_draw = c(1, 1, 1, 2, 1, 2),
  fast = rep(0, 6),
  gender = rep("M", 6),
  group = rep("men", 6),
  MCLUSTER = c(601, 601, 601, 511, 511, 829),
  MNUMBER = c(289, 151, 9, 94, 95, 234),
  M01 = c(2, 2, 1, 2, 1, 1),
  unadjstatus = c("low", "low", "low", "low", "ok", "ok"),
  wgt = c(0.995193, 0.995193, 0.995193, 1.117896, 1.117896, 1.477476),
  MREGION = c(2, 2, 3, 2, 1, 3),
  region = c("central", "central", "south", "central", "north", "south"),
  # Placeholder column: logical NA in every row.
  strata = rep(NA, 6),
  MTYPE = factor(c("2", "1", "1", "2", "2", "2")),
  stringsAsFactors = FALSE
)

It sounds like you want region to be a factor, so we'll properly turn region into a factor variable. I like forcats, so I'll use forcats::as_factor().

# Convert `region` from character to factor. forcats::as_factor() orders the
# levels by first appearance in the data rather than alphabetically.
df_mod <- mutate(df, region = as_factor(region))

Note that you don't have all possible combinations of region and MTYPE in this data. You can cross the levels of factors with forcats::fct_cross(). To keep missing levels, you set keep_empty to TRUE.

# Cross the two factors. keep_empty = TRUE keeps level combinations that do
# not occur in the data (e.g. north:1 below).
with(df_mod, fct_cross(region, MTYPE, keep_empty = TRUE))
#> [1] central:2 central:1 south:1   central:2 north:2   south:2  
#> Levels: central:1 south:1 north:1 central:2 south:2 north:2

We can also set the sep argument, to have the combinations as you wanted them in your example.

# Store the crossed factor as a new column. sep = "" joins the level names
# with no separator, giving labels such as "central2".
mutate(
  df_mod,
  regmtype_cross = fct_cross(region, MTYPE, keep_empty = TRUE, sep = "")
)
#>   Zn_gdl    znadj status   CRP  AGP inflst time_blood_draw fast gender
#> 1 59.375 59.37500    low  4.97 0.65 A_norm               1    0      M
#> 2 56.250 59.05238    low 19.23 1.56   earl               1    0      M
#> 3 50.000 50.00000    low  1.19 0.61 A_norm               1    0      M
#> 4 53.125 55.77169    low 13.29 1.07   earl               2    0      M
#> 5 93.750 99.39513     ok  5.03 0.80   incu               1    0      M
#> 6 68.750 68.75000     ok  0.84 0.44 A_norm               2    0      M
#>   group MCLUSTER MNUMBER M01 unadjstatus      wgt MREGION  region strata
#> 1   men      601     289   2         low 0.995193       2 central     NA
#> 2   men      601     151   2         low 0.995193       2 central     NA
#> 3   men      601       9   1         low 0.995193       3   south     NA
#> 4   men      511      94   2         low 1.117896       2 central     NA
#> 5   men      511      95   1          ok 1.117896       1   north     NA
#> 6   men      829     234   1          ok 1.477476       3   south     NA
#>   MTYPE regmtype_cross
#> 1     2       central2
#> 2     1       central1
#> 3     1         south1
#> 4     2       central2
#> 5     2         north2
#> 6     2         south2


# To label dwelling type with words instead of codes, rename the levels of
# MTYPE with `fct_recode()` (new name on the left, existing level on the
# right). Passing the recoded factor to `fct_cross()` in place of MTYPE
# should then give combined labels such as "centralurban".
mutate(df_mod, mtype_words = fct_recode(MTYPE, rural = "1", urban = "2"))
#>   Zn_gdl    znadj status   CRP  AGP inflst time_blood_draw fast gender
#> 1 59.375 59.37500    low  4.97 0.65 A_norm               1    0      M
#> 2 56.250 59.05238    low 19.23 1.56   earl               1    0      M
#> 3 50.000 50.00000    low  1.19 0.61 A_norm               1    0      M
#> 4 53.125 55.77169    low 13.29 1.07   earl               2    0      M
#> 5 93.750 99.39513     ok  5.03 0.80   incu               1    0      M
#> 6 68.750 68.75000     ok  0.84 0.44 A_norm               2    0      M
#>   group MCLUSTER MNUMBER M01 unadjstatus      wgt MREGION  region strata
#> 1   men      601     289   2         low 0.995193       2 central     NA
#> 2   men      601     151   2         low 0.995193       2 central     NA
#> 3   men      601       9   1         low 0.995193       3   south     NA
#> 4   men      511      94   2         low 1.117896       2 central     NA
#> 5   men      511      95   1          ok 1.117896       1   north     NA
#> 6   men      829     234   1          ok 1.477476       3   south     NA
#>   MTYPE mtype_words
#> 1     2       urban
#> 2     1       rural
#> 3     1       rural
#> 4     2       urban
#> 5     2       urban
#> 6     2       urban

Created on 2019-08-21 by the reprex package (v0.3.0)

1 Like

Yes, it's the same; that's why I wrote either North1 or Northrural. I can easily change between the two.

I tried your suggestion and it works. Thanks a lot :slight_smile:

1 Like

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.