Creating multiple columns based on set of column values and then creating new values for columns

Hello,

I have this table df1 and from it I want to create df2. The new columns in df2 should be created from the distinct values present in all the columns whose names contain "Col". Each new test column should then be compared against all of those columns, row by row: if its name is present in the row, score it as 1, otherwise 0. I would like a base R approach or a non-transpose approach.

  # Input table: three character columns; NA marks an empty cell.
  df1 <- data.frame(
    Col1 = c("Test1", "Test1", "Test2", "Test2", "Test7"),
    Col2 = c(NA, "Test3", "Test1", "Test2", NA),
    Col3 = c(NA, "Test2", NA, NA, NA),
    stringsAsFactors = FALSE
  )

  # Desired result: the three input columns followed by one integer 0/1
  # indicator column per distinct value found in them.
  df2 <- data.frame(
    Col1 = c("Test1", "Test1", "Test2", "Test2", "Test7"),
    Col2 = c(NA, "Test3", "Test1", "Test2", NA),
    Col3 = c(NA, "Test2", NA, NA, NA),
    Test1 = c(1L, 1L, 1L, 0L, 0L),
    Test2 = c(0L, 1L, 1L, 1L, 0L),
    Test3 = c(0L, 1L, 0L, 0L, 0L),
    Test7 = c(0L, 0L, 0L, 0L, 1L),
    stringsAsFactors = FALSE
  )

I am sure there is a more elegant solution.

# Example input: three character columns; NA marks an empty cell.
df1 <- data.frame(
  Col1 = c("Test1", "Test1", "Test2", "Test2", "Test7"),
  Col2 = c(NA, "Test3", "Test1", "Test2", NA),
  Col3 = c(NA, "Test2", NA, NA, NA),
  stringsAsFactors = FALSE
)

# Keep only the columns whose name contains "Col".
tmp <- df1[, grepl("Col", names(df1))]

# Flatten the selected columns, discard the NAs, then keep each value once
# (in order of first appearance).
tmp2 <- unlist(tmp)
tmp2 <- unique(tmp2[!is.na(tmp2)])

# Add a 0/1 indicator column named `Nm` to `DF`.
#
# Nm: a single string. It is both the value searched for and the name of the
#     column that is added (or overwritten).
# DF: a data frame. Every column present at call time is searched.
#
# Returns `DF` with the extra numeric column: 1 where at least one cell in
# the row equals `Nm` exactly, 0 otherwise. NA cells never count as a match.
myFunc <- function(Nm, DF) {
  # Compare whole cell values rather than calling grepl(): grepl() treats
  # `Nm` as a regular expression and matches substrings, so "Test1" would
  # also flag a row that only contains "Test10".
  cells <- as.matrix(DF)
  DF[[Nm]] <- as.numeric(rowSums(cells == Nm, na.rm = TRUE) > 0)
  DF
}
# Fold myFunc over the distinct values so that df1 gains one indicator column
# per value, in the order the values appear in tmp2.
df1 <- Reduce(function(DF, NAME) myFunc(NAME, DF), tmp2, df1)
df1
#>    Col1  Col2  Col3 Test1 Test2 Test7 Test3
#> 1 Test1  <NA>  <NA>     1     0     0     0
#> 2 Test1 Test3 Test2     1     1     0     1
#> 3 Test2 Test1  <NA>     1     1     0     0
#> 4 Test2 Test2  <NA>     0     1     0     0
#> 5 Test7  <NA>  <NA>     0     0     1     0

Created on 2021-04-22 by the reprex package (v0.3.0)

This is the best solution I've come up with, but I don't like the for loop wrapped around sapply. I just want that section to be cleaner. Any ideas?


library(tidyverse)


# Example input: three character columns; NA marks an empty cell.
df <-
  data.frame(
    stringsAsFactors = FALSE,
    Col1 = c("Test1", "Test1", "Test2", "Test2", "Test7"),
    Col2 = c(NA, "Test3", "Test1", "Test2", NA),
    Col3 = c(NA, "Test2", NA, NA, NA)
  )

# Distinct non-missing cell values, sorted; each one becomes an indicator
# column. Plain base R calls are used, so this step needs no pipe operator.
output <- unique(unlist(df, use.names = FALSE))
output <- sort(output[!is.na(output)])

# One named 0/1 vector per row of df.
# - seq_len() yields no iterations for a zero-row data frame, whereas
#   1:nrow(df) would count 1, 0.
# - lapply() builds the list in one go instead of growing it inside a loop.
# - vapply() pins the result to one numeric per value; a plain if/else is
#   enough because each test is a scalar.
listy <- lapply(seq_len(nrow(df)), function(j) {
  # Flatten the row to a plain vector so %in% compares cell values.
  row_values <- unlist(df[j, ], use.names = FALSE)
  vapply(
    output,
    function(x) if (x %in% row_values) 1 else 0,
    numeric(1)
  )
})

# Stack the per-row vectors into a matrix (columns named after the values)
# and append it to the original columns.
df_final <-
  cbind(df, do.call(rbind, listy))

df_final
#>    Col1  Col2  Col3 Test1 Test2 Test3 Test7
#> 1 Test1  <NA>  <NA>     1     0     0     0
#> 2 Test1 Test3 Test2     1     1     1     0
#> 3 Test2 Test1  <NA>     1     1     0     0
#> 4 Test2 Test2  <NA>     0     1     0     0
#> 5 Test7  <NA>  <NA>     0     0     0     1

Created on 2021-04-22 by the reprex package (v0.3.0)

Alrighty, I think this is the cleanest way :slight_smile:

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'dplyr' was built under R version 4.0.5

# Example input: three character columns; NA marks an empty cell.
df <-
  data.frame(
    stringsAsFactors = FALSE,
    Col1 = c("Test1", "Test1", "Test2", "Test2", "Test7"),
    Col2 = c(NA, "Test3", "Test1", "Test2", NA),
    Col3 = c(NA, "Test2", NA, NA, NA)
  )

# Distinct non-missing cell values, sorted; each one becomes an indicator
# column. Written with base R calls only, so no pipe operator is required.
test_values <- unique(unlist(df, use.names = FALSE))
test_values <- sort(test_values[!is.na(test_values)])

# One named numeric 0/1 vector per row of df. Each element must be an atomic
# vector, not a list: rbind() on lists builds a list-matrix, and cbind()
# would then create list columns that print like numbers but cannot be
# summed or compared.
tested_list <-
  lapply(seq_len(nrow(df)), function(i) {
    row_values <- unlist(df[i, ], use.names = FALSE)
    # %in% is vectorised over test_values and never returns NA.
    flags <- as.numeric(test_values %in% row_values)
    names(flags) <- test_values
    flags
  })

# Stack the per-row vectors into a numeric matrix and append it to the
# original columns.
df_final <-
  cbind(
    df,
    do.call(rbind, tested_list)
  )


df_final
#>    Col1  Col2  Col3 Test1 Test2 Test3 Test7
#> 1 Test1  <NA>  <NA>     1     0     0     0
#> 2 Test1 Test3 Test2     1     1     1     0
#> 3 Test2 Test1  <NA>     1     1     0     0
#> 4 Test2 Test2  <NA>     0     1     0     0
#> 5 Test7  <NA>  <NA>     0     0     0     1

Created on 2021-04-22 by the reprex package (v0.3.0)

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.

If you have a query related to it or one of the replies, start a new topic and refer back with a link.