Tibble split row by keywords into two columns

I am new to RStudio and learning how to perform text mining. I have a CSV file that contains a single column of comma delimited strings which shows a customer's likes and dislikes. For example: "Likes: apples, pears Dislikes: bananas" , "Dislikes: nuts, Likes: peaches"

What is a good way to split each row into two columns based on Likes and Dislikes?

Tricky problem because the order of Likes and Dislikes isn't fixed. I managed to solve it with a two-step approach.

library(tidyverse)

df <- tibble(string = c("Likes: apples, pears Dislikes: bananas" , "Dislikes: nuts, Likes: peaches"))

likes_first <- filter(df, str_detect(string, "^Likes"))
dislikes_first <- filter(df, str_detect(string, "^Dislikes"))

likes_first <- likes_first %>% 
  extract(string, into = c("cat1", "val1", "cat2", "val2"), regex = "(Likes):?\\s(.*)?\\s(Dislikes):?\\s(.*)")

dislikes_first <- dislikes_first %>% 
  extract(string, into = c("cat2", "val2", "cat1", "val1"), regex = "(Dislikes):?\\s(.*)?\\s(Likes):?\\s(.*)")

bind_rows(likes_first, dislikes_first)
#> # A tibble: 2 x 4
#>   cat1  val1          cat2     val2   
#>   <chr> <chr>         <chr>    <chr>  
#> 1 Likes apples, pears Dislikes bananas
#> 2 Likes peaches       Dislikes nuts,

Created on 2020-07-03 by the reprex package (v0.3.0)

2 Likes

Hi @Mikepat,

That was tricky - Nice :+1:

# Load libraries ----------------------------------------------------------
library("tidyverse")


# Set example data --------------------------------------------------------
my_data <- tribble(
  ~id, ~preferences,
  "C1", "Likes: apples, pears Dislikes: bananas",
  "C2", "Dislikes: nuts, Likes: peaches",
  "C3", "Likes: oranges, bananas, pears Dislikes: carrots",
  "C4", "Dislikes: cucumbers, Likes: carrots"
)


# Wrangle data ------------------------------------------------------------

# String clean up
my_data <- my_data %>% 
  mutate(preferences_clean = str_replace_all(preferences, "\\W+", " ") %>% 
           str_to_lower %>%
           str_squish)

# Extract target strings
my_data <- my_data %>% 
  mutate(likes = case_when(str_detect(preferences_clean, "^likes") ~ 
                             str_match(preferences_clean, "likes (.+) dislikes")[,2],
                           str_detect(preferences_clean, "^dislikes") ~ 
                             str_match(preferences_clean, "dislikes .+ likes (.+)")[,2]),
         dislikes = case_when(str_detect(preferences_clean, "^dislikes") ~ 
                             str_match(preferences_clean, "dislikes (.+) likes")[,2],
                           str_detect(preferences_clean, "^likes") ~ 
                             str_match(preferences_clean, "likes .+ dislikes (.+)")[,2]))

# Create tidy data: Get individual likings and convert to long
my_data_long <- my_data %>% 
  select(id, likes, dislikes) %>% 
  separate(likes, into = str_c("like_", 1:10)) %>% 
  separate(dislikes, into = str_c("dislike_", 1:10)) %>% 
  pivot_longer(cols = -id, names_to = "liking", values_to = "object") %>% 
  drop_na(object) %>% 
  mutate(liking = str_replace(liking, "_\\d+", ""))

# Or convert to wide
my_data_wide <- my_data_long %>%
  pivot_wider(id_cols = id, names_from = object, values_from = liking)

Yielding

> my_data
# A tibble: 4 x 5
  id    preferences                                      preferences_clean                            likes                 dislikes 
  <chr> <chr>                                            <chr>                                        <chr>                 <chr>    
1 C1    Likes: apples, pears Dislikes: bananas           likes apples pears dislikes bananas          apples pears          bananas  
2 C2    Dislikes: nuts, Likes: peaches                   dislikes nuts likes peaches                  peaches               nuts     
3 C3    Likes: oranges, bananas, pears Dislikes: carrots likes oranges bananas pears dislikes carrots oranges bananas pears carrots  
4 C4    Dislikes: cucumbers, Likes: carrots              dislikes cucumbers likes carrots             carrots               cucumbers
> my_data_long
# A tibble: 11 x 3
   id    liking  object   
   <chr> <chr>   <chr>    
 1 C1    like    apples   
 2 C1    like    pears    
 3 C1    dislike bananas  
 4 C2    like    peaches  
 5 C2    dislike nuts     
 6 C3    like    oranges  
 7 C3    like    bananas  
 8 C3    like    pears    
 9 C3    dislike carrots  
10 C4    like    carrots  
11 C4    dislike cucumbers
> my_data_wide
# A tibble: 4 x 9
  id    apples pears bananas peaches nuts    oranges carrots cucumbers
  <chr> <chr>  <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>    
1 C1    like   like  dislike NA      NA      NA      NA      NA       
2 C2    NA     NA    NA      like    dislike NA      NA      NA       
3 C3    NA     like  like    NA      NA      like    dislike NA       
4 C4    NA     NA    NA      NA      NA      NA      like    dislike  

Hope it helps :slightly_smiling_face:

2 Likes

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.