data wrangling when columns are to be treated as row value

AbhishekHP · March 28, 2019, 4:25pm

Hello Data Wranglers,

Lets consider the dataset as below:

df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),#"AC.4)","AC.8)","AC.7)"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB")#AB(.4)","AB(.8)","AA(.7)")
)

Expected output would be additional 6 columns (datatype logical)
df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB"),
                  isAA = True if column name "AA" is present in any of the columns starting with "top" i.e top1,top2,top3 else False
                  isAB = True if column name "AB" is present in any of the columns starting with "top" i.e top1,top2,top3 else False
                  ........
                  isBC = True if column name "BC" is present in any of the columns starting with "top" i.e top1,top2,top3 else False
                  )

How can we use mutate_at in this case, not sure how to use the mutate_at to look at few columns only. **Especially how can we compare this row wise

I tried this but doesn't produce expected result because it doesnt check row wise and based on column name

mylist1 <- colnames(df)
df$isAA <- ifelse(df$top1 %in% mylist,T,F)

Even small hint would help, Champ

FJCC · March 28, 2019, 5:44pm

This is what I came up with.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB"),
                  stringsAsFactors = FALSE
)


df_Logical <- df %>% rowwise() %>% 
     do(isAA = "AA" %in% c(.$top1, .$top2, .$top3),
        isAB = "AB" %in% c(.$top1, .$top2, .$top3),
        isAC = "AC" %in% c(.$top1, .$top2, .$top3),
        isBA = "BA" %in% c(.$top1, .$top2, .$top3),
        isBB = "BB" %in% c(.$top1, .$top2, .$top3),
        isBC = "BC" %in% c(.$top1, .$top2, .$top3))

df <- cbind(df, df_Logical)
df[, 10:15]
#>     isAA  isAB  isAC  isBA  isBB  isBC
#> 1   TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 2   TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 3   TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 4   TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 5   TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 6   TRUE  TRUE FALSE  TRUE FALSE FALSE
#> 7   TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 8   TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 9   TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 10 FALSE  TRUE  TRUE FALSE FALSE FALSE

^{Created on 2019-03-28 by the reprex package (v0.2.1)}

Yarnabrina · March 28, 2019, 6:11pm

Almost negligible modification of FJCC's solution:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df <- data.frame(AA = runif(n = 10),
                 AB = runif(n = 10),
                 AC = runif(n = 10),
                 BA = runif(n = 10),
                 BB = runif(n = 10),
                 BC = runif(n = 10),
                 top1 = c("AA", "AB", "AA", "AA", "AA", "AA", "AA", "AA", "AA", "AC"),
                 top2 = c("BA", "BC", "AC", "BB", "AB", "AB", "BA", "BB", "AC", "AB"),
                 top3 = c("BB", "AA", "BA", "BA", "BC", "BA", "BB", "BA", "BA", "AB"),
                 stringsAsFactors = FALSE)

df %>%
  rowwise() %>%
  mutate(isAA = "AA" %in% c(top1, top2, top3),
         isAB = "AB" %in% c(top1, top2, top3),
         isAC = "AC" %in% c(top1, top2, top3),
         isBA = "BA" %in% c(top1, top2, top3),
         isBB = "BB" %in% c(top1, top2, top3),
         isBC = "BC" %in% c(top1, top2, top3)) %>%
  as.data.frame()
#>            AA          AB         AC        BA         BB        BC top1
#> 1  0.20445046 0.119937260 0.08083993 0.4063985 0.75617881 0.7209522   AA
#> 2  0.99092620 0.624576405 0.18874306 0.4154909 0.68194900 0.1995511   AB
#> 3  0.38123060 0.345235928 0.87227311 0.0737821 0.40427905 0.3377539   AA
#> 4  0.24713706 0.965764469 0.75686828 0.6805222 0.93083331 0.7541616   AA
#> 5  0.26810480 0.003466556 0.42967159 0.3595230 0.82441068 0.5997017   AA
#> 6  0.27804662 0.453115983 0.77424947 0.6936131 0.05298007 0.9311033   AA
#> 7  0.03854091 0.471186985 0.39915372 0.6932790 0.44143088 0.5764811   AA
#> 8  0.27911416 0.168700268 0.49020830 0.4326977 0.51218137 0.4699371   AA
#> 9  0.49350713 0.684234354 0.94819395 0.6529040 0.57309097 0.2536980   AA
#> 10 0.94400907 0.358976069 0.02968900 0.4295179 0.94884435 0.3073917   AC
#>    top2 top3  isAA  isAB  isAC  isBA  isBB  isBC
#> 1    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 2    BC   AA  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 3    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 4    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 5    AB   BC  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 6    AB   BA  TRUE  TRUE FALSE  TRUE FALSE FALSE
#> 7    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 8    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 9    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 10   AB   AB FALSE  TRUE  TRUE FALSE FALSE FALSE

^{Created on 2019-03-29 by the reprex package (v0.2.1)}

I was hoping to get a better solution using starts_with, but failed. I hope Andres or someone else will provide that solution.

It'd be nicer if all the new columns are defined simultaneously.

andresrcs · March 28, 2019, 8:01pm

Since the only variables containing this character values start with "top" you can do this

library(dplyr)
df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB")
)

df %>% 
    mutate(isAA = if_else(rowSums(df == "AA") > 0, TRUE, FALSE),
           isAB = if_else(rowSums(df == "AB") > 0, TRUE, FALSE),
           isAC = if_else(rowSums(df == "AC") > 0, TRUE, FALSE),
           isBA = if_else(rowSums(df == "BA") > 0, TRUE, FALSE),
           isBB = if_else(rowSums(df == "BB") > 0, TRUE, FALSE),
           isBC = if_else(rowSums(df == "BC") > 0, TRUE, FALSE))
#>           AA         AB        AC          BA        BB         BC top1
#> 1  0.6742042 0.29451579 0.3075144 0.591529769 0.9765826 0.68425233   AA
#> 2  0.4490924 0.15038425 0.4767288 0.115550275 0.1599306 0.77917316   AB
#> 3  0.8451983 0.69324689 0.2861632 0.423071244 0.1690436 0.66394665   AA
#> 4  0.8485679 0.30741232 0.2709162 0.947064351 0.7026782 0.16182432   AA
#> 5  0.8637762 0.93521818 0.8366696 0.006317084 0.6648255 0.83341105   AA
#> 6  0.9858766 0.18817195 0.2042428 0.195960124 0.2424668 0.00549368   AA
#> 7  0.6119642 0.06430763 0.3012005 0.368583605 0.0419986 0.89208593   AA
#> 8  0.7708253 0.12529113 0.8712155 0.243627936 0.3246351 0.95558587   AA
#> 9  0.1446842 0.72507692 0.9396525 0.401439661 0.5583318 0.10795227   AA
#> 10 0.5119219 0.34904416 0.6956383 0.044357371 0.9120155 0.59692495   AC
#>    top2 top3  isAA  isAB  isAC  isBA  isBB  isBC
#> 1    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 2    BC   AA  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 3    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 4    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 5    AB   BC  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 6    AB   BA  TRUE  TRUE FALSE  TRUE FALSE FALSE
#> 7    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 8    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 9    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 10   AB   AB FALSE  TRUE  TRUE FALSE FALSE FALSE

If you need to specifically select the columns then you can do this

df %>% 
  mutate(isAA = if_else(rowSums(df %>% select(starts_with("top")) == "AA") > 0, TRUE, FALSE)

andresrcs · March 28, 2019, 8:56pm

If you dont know the unique values in advance (i.e. "AA", "AB", "AC", etc) you can do this

library(dplyr)
library(purrr)
df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB"),
                  stringsAsFactors = FALSE
)

df %>% 
  cbind(map_dfc(df %>%
                  select(starts_with("top")) %>%
                  unlist(use.names = FALSE) %>%
                  unique(), 
                ~df %>%
                  select(starts_with("top")) %>% 
                  transmute(!!paste0("is", .x) := if_else(rowSums(df == .x) > 0, TRUE, FALSE))
  ))
#>           AA         AB        AC          BA        BB         BC top1
#> 1  0.6742042 0.29451579 0.3075144 0.591529769 0.9765826 0.68425233   AA
#> 2  0.4490924 0.15038425 0.4767288 0.115550275 0.1599306 0.77917316   AB
#> 3  0.8451983 0.69324689 0.2861632 0.423071244 0.1690436 0.66394665   AA
#> 4  0.8485679 0.30741232 0.2709162 0.947064351 0.7026782 0.16182432   AA
#> 5  0.8637762 0.93521818 0.8366696 0.006317084 0.6648255 0.83341105   AA
#> 6  0.9858766 0.18817195 0.2042428 0.195960124 0.2424668 0.00549368   AA
#> 7  0.6119642 0.06430763 0.3012005 0.368583605 0.0419986 0.89208593   AA
#> 8  0.7708253 0.12529113 0.8712155 0.243627936 0.3246351 0.95558587   AA
#> 9  0.1446842 0.72507692 0.9396525 0.401439661 0.5583318 0.10795227   AA
#> 10 0.5119219 0.34904416 0.6956383 0.044357371 0.9120155 0.59692495   AC
#>    top2 top3  isAA  isAB  isAC  isBA  isBB  isBC
#> 1    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 2    BC   AA  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 3    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 4    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 5    AB   BC  TRUE  TRUE FALSE FALSE FALSE  TRUE
#> 6    AB   BA  TRUE  TRUE FALSE  TRUE FALSE FALSE
#> 7    BA   BB  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 8    BB   BA  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> 9    AC   BA  TRUE FALSE  TRUE  TRUE FALSE FALSE
#> 10   AB   AB FALSE  TRUE  TRUE FALSE FALSE FALSE

nwerth · March 28, 2019, 9:25pm

When I see any problem where the column names have meaning in the code (that is, you can't just replace them with arbitrary strings), my first thought is "This is hard because it's not tidy."

And that thought's usually right. So the first step is to make it tidy, so that everything with meaning is a value in a column. Here, that's done by making a "long" version of the dataset. After that, we group everything by an ID for the original row, use vectorized comparisons, and then "re-widen" the data. Joining back to the original gives us the other columns we've dropped.

library(dplyr)
library(tidyr)
library(tidyselect)

df <- data.frame( AA = runif(10), AB = runif(10), AC = runif(10),
                  BA = runif(10), BB = runif(10), BC = runif(10),
                  top1 = c("AA","AB","AA","AA","AA","AA","AA","AA","AA","AC"),
                  top2 = c("BA","BC","AC","BB","AB","AB","BA","BB","AC","AB"),
                  top3 = c("BB","AA","BA","BA","BC","BA","BB","BA","BA","AB"),
                  stringsAsFactors = FALSE
)
df[["id"]] <- seq_len(nrow(df))

df_long <- df %>%
  gather(
    key = "top_n",
    value = "column",
    top1, top2, top3
  ) %>%
  crossing(is_var = paste0("is", unique(.[["column"]]))) %>%
  mutate(is_value = endsWith(is_var, column))

head(df_long)
#          AA        AB        AC        BA         BB        BC id top_n column is_var is_value
# 1 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isAA     TRUE
# 2 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isAB    FALSE
# 3 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isAC    FALSE
# 4 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isBA    FALSE
# 5 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isBB    FALSE
# 6 0.4309631 0.7899621 0.1736746 0.8331289 0.03539516 0.7185743  1  top1     AA   isBC    FALSE

df_long %>%
  group_by_at(unique(c("id", "is_var", .[["column"]]))) %>%
  summarise(is_value = any(is_value)) %>%
  mutate(is_value = as.integer(is_value)) %>%
  spread(key = "is_var", value = "is_value") %>%
  left_join(
    df[, c("id", "top1", "top2", "top3")],
    by = "id"
  )
# # A tibble: 10 x 16
# # Groups:   id, AA, AB, AC, BA, BC [?]
#       id    AA    AB    AC     BA     BC     BB  isAA  isAB  isAC  isBA  isBB  isBC top1  top2  top3 
#    <int> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl> <int> <int> <int> <int> <int> <int> <chr> <chr> <chr>
#  1     1 0.431 0.790 0.174 0.833  0.719  0.0354     1     0     0     1     1     0 AA    BA    BB   
#  2     2 0.880 0.958 0.428 0.567  0.0895 0.492      1     1     0     0     0     1 AB    BC    AA   
#  3     3 0.886 0.193 0.556 0.682  0.354  0.221      1     0     1     1     0     0 AA    AC    BA   
#  4     4 0.432 0.380 0.171 0.0643 0.794  0.969      1     0     0     1     1     0 AA    BB    BA   
#  5     5 0.564 0.830 0.438 0.218  0.872  0.171      1     1     0     0     0     1 AA    AB    BC   
#  6     6 0.397 0.727 0.601 0.732  0.208  0.730      1     1     0     1     0     0 AA    AB    BA   
#  7     7 0.167 0.323 0.346 0.968  0.812  0.394      1     0     0     1     1     0 AA    BA    BB   
#  8     8 0.331 0.911 0.985 0.570  0.543  0.632      1     0     0     1     1     0 AA    BB    BA   
#  9     9 0.887 0.914 0.346 0.0320 0.429  0.950      1     0     1     1     0     0 AA    AC    BA   
# 10    10 0.758 0.261 0.360 0.372  0.0492 0.0228     0     1     1     0     0     0 AC    AB    AB   ```

jonspring · March 28, 2019, 9:29pm

Here's another approach, a little longer.

library(tidyverse)
df %>%
  # convert to long form, noting orig row
  mutate(row = row_number()) %>%
  select(row, top1:top3) %>%
  gather(col, val, -row) %>%

  # capture presence as TRUE and spread wide
  count(row, val) %>%
  mutate(val = paste0("is", val),
         n   = n > 0) %>%
  spread(val, n, fill = FALSE)

Output, which could be appended to original with bind_cols:

# A tibble: 10 x 7
     row isAA  isAB  isAC  isBA  isBB  isBC 
   <int> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
 1     1 TRUE  FALSE FALSE TRUE  TRUE  FALSE
 2     2 TRUE  TRUE  FALSE FALSE FALSE TRUE 
 3     3 TRUE  FALSE TRUE  TRUE  FALSE FALSE
 4     4 TRUE  FALSE FALSE TRUE  TRUE  FALSE
 5     5 TRUE  TRUE  FALSE FALSE FALSE TRUE 
 6     6 TRUE  TRUE  FALSE TRUE  FALSE FALSE
 7     7 TRUE  FALSE FALSE TRUE  TRUE  FALSE
 8     8 TRUE  FALSE FALSE TRUE  TRUE  FALSE
 9     9 TRUE  FALSE TRUE  TRUE  FALSE FALSE
10    10 FALSE TRUE  TRUE  FALSE FALSE FALSE

AbhishekHP · March 29, 2019, 8:28am

Thanks for fastest solution @FJCC

AbhishekHP · March 29, 2019, 8:31am

Thanks for detailed solution @andresrcs.

I was wondering it would have been more tidy if if there a way without comparing each time == "AA", == "AB"....!

AbhishekHP · March 29, 2019, 8:32am

thanks @nwerth and @jonspring . Your solution is much more tidy as we dont need to compare each column 1 by 1 : == "AA", == "AB"

AbhishekHP · March 29, 2019, 8:33am

Thanks for suggestion and Andres provided expected detailed solution

system · April 5, 2019, 8:33am

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.